Python Data Science with Pandas & NumPy
Python is the undisputed champion of data science. With powerful libraries like Pandas and NumPy, you can analyze, manipulate, and visualize data like a professional data scientist.
What is Data Science?
Data science combines statistics, programming, and domain expertise to extract insights from data. Python’s ecosystem makes it the perfect language for data science workflows.
Why Python for Data Science?
- Rich Libraries: NumPy, Pandas, Matplotlib, Scikit-learn
- Easy Syntax: Readable code for complex operations
- Community Support: Extensive documentation and tutorials
- Integration Ready: Works with Jupyter notebooks
- Industry Standard: Used by data scientists worldwide
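All of these libraries share well-established import conventions. Here is a quick sketch (assuming the packages are installed) that loads the usual stack and confirms the versions:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Confirm the stack is importable and check versions
print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)
print("Matplotlib:", matplotlib.__version__)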
NumPy Fundamentals
Installation
pip install numpy
NumPy Arrays
import numpy as np
# Create arrays
arr = np.array([1, 2, 3, 4, 5])
matrix = np.array([[1, 2], [3, 4]])
print(arr) # [1 2 3 4 5]
print(matrix) # [[1 2] [3 4]]
print(arr.dtype) # int64
print(arr.shape) # (5,) - 1D array
print(matrix.shape) # (2, 2) - 2D array
Array Operations
import numpy as np
arr = np.array([10, 20, 30, 40, 50])
# Basic arithmetic
print(arr + 5) # [15 25 35 45 55]
print(arr * 2) # [20 40 60 80 100]
print(arr - 5) # [5 15 25 35 45]
# Statistical operations
print(np.mean(arr)) # 30.0
print(np.median(arr)) # 30.0
print(np.std(arr)) # 14.14213562
print(np.sum(arr)) # 150
print(np.max(arr)) # 50
print(np.min(arr)) # 10
Array Creation Methods
import numpy as np
# Different ways to create arrays
zeros = np.zeros(5) # [0. 0. 0. 0. 0.]
ones = np.ones(3) # [1. 1. 1.]
full = np.full(4, 7) # [7 7 7 7] (integer fill value gives an integer array)
range_arr = np.arange(0, 10, 2) # [0 2 4 6 8]
linspace = np.linspace(0, 1, 5) # [0. 0.25 0.5 0.75 1.]
random_arr = np.random.rand(3) # Random values between 0-1
# Array reshaping
arr = np.arange(12)
reshaped = arr.reshape(3, 4) # 3x4 matrix
reshaped_back = reshaped.reshape(-1) # Back to 1D array (12 elements)
Mathematical Functions
import numpy as np
# Trigonometric
angles = np.array([0, np.pi/4, np.pi/2])
sin_vals = np.sin(angles)
cos_vals = np.cos(angles)
tan_vals = np.tan(angles)
# Exponential and logarithmic
exp_vals = np.exp([1, 2, 3]) # [e^1, e^2, e^3]
log_vals = np.log([1, np.e, 10]) # [0, 1, 2.302585]
# Linear algebra operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
C = A + B # Element-wise addition
D = A * B # Element-wise multiplication (NOT matrix multiplication)
E = np.dot(A, B) # Matrix multiplication (equivalent to A @ B)
F = np.transpose(A) # Transpose matrix
print("Matrix addition:\n", C)
print("Element-wise product:\n", D)
print("Matrix product:\n", E)
Pandas Fundamentals
Installation
pip install pandas
Pandas Series
import pandas as pd
import numpy as np
# Create Series from list
data = [10, 20, 30, 40, 50]
series = pd.Series(data, name='numbers')
print(series)
# 0 10
# 1 20
# 2 30
# 3 40
# 4 50
# Name: numbers, dtype: int64
# Create Series with custom index
indices = ['a', 'b', 'c', 'd', 'e']
series_with_index = pd.Series([100, 200, 300, 400, 500], index=indices)
print(series_with_index)
# a 100
# b 200
# c 300
# d 400
# e 500
# dtype: int64
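A custom index lets you access elements by label, much like a dictionary, in addition to access by position. A short sketch using series_with_index from above:
# Label-based access
print(series_with_index['c']) # 300
# Label slicing includes both endpoints
print(series_with_index['b':'d']) # rows b, c and d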
Pandas DataFrame
import pandas as pd
import numpy as np
# Create DataFrame from dictionary
data = {
'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'age': [25, 30, 35, 40, 28],
'city': ['New York', 'Los Angeles', 'Chicago', 'Boston', 'Seattle'],
'salary': [50000, 60000, 70000, 80000, 55000]
}
df = pd.DataFrame(data)
print(df)
# name age city salary
# 0 Alice 25 New York 50000
# 1 Bob 30 Los Angeles 60000
# 2 Charlie 35 Chicago 70000
# 3 David 40 Boston 80000
# 4 Eva 28 Seattle 55000
print(df.dtypes)
# name object
# age int64
# city object
# salary int64
print(df.shape) # (5, 4) - 5 rows, 4 columns
Data Import and Export
import pandas as pd
# Read CSV file
df = pd.read_csv('data.csv')
print(df.head())
# Read Excel file (reading .xlsx files requires the openpyxl package)
df_excel = pd.read_excel('data.xlsx')
print(df_excel.head())
# Read JSON file
df_json = pd.read_json('data.json')
print(df_json.head())
# Export to CSV
df.to_csv('output.csv', index=False)
# Export to Excel
df.to_excel('output.xlsx', index=False)
# Export to JSON
df.to_json('output.json', orient='records')
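Real-world files are rarely this tidy, and read_csv accepts many options to cope. A hedged sketch with a few commonly useful parameters (the file and the 'date' column are hypothetical):
df = pd.read_csv(
    'data.csv',
    sep=',',                 # field delimiter
    parse_dates=['date'],    # parse this column as datetime (if present)
    na_values=['NA', '?']    # extra strings to treat as missing
)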
Data Analysis with Pandas
Basic Data Exploration
import pandas as pd
import numpy as np
# Load sample data
data = {
'product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor'],
'price': [999, 29, 79, 299],
'category': ['Electronics', 'Electronics', 'Electronics', 'Electronics'],
'rating': [4.5, 4.2, 4.7, 4.8],
'stock': [50, 200, 150, 75]
}
df = pd.DataFrame(data)
# Basic information
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Data types:\n", df.dtypes)
print("Basic statistics:\n", df.describe())
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())
# Memory usage
print("Memory usage:\n", df.info())Data Selection and Filtering
import pandas as pd
# Create sample DataFrame
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'age': [25, 30, 35, 40, 28],
'department': ['IT', 'HR', 'IT', 'Finance', 'IT'],
'salary': [50000, 60000, 70000, 80000, 55000],
'city': ['NYC', 'LA', 'Chicago', 'Boston', 'Seattle']
})
# Select columns
names = df['name']
names_and_ages = df[['name', 'age']] # double brackets return a DataFrame
all_columns = df.loc[:, ['name', 'age', 'city']]
# Select rows by position
first_three = df.iloc[0:3] # rows 0-2 (iloc slices exclude the endpoint)
last_three = df.iloc[-3:] # last three rows
# Conditional selection
high_earners = df[df['salary'] > 60000]
it_employees = df[df['department'] == 'IT']
young_it_employees = df[(df['department'] == 'IT') & (df['age'] < 35)]
print("High earners (>60k):\n", high_earners)
print("IT department employees:\n", it_employees)
print("Young IT employees (<35):\n", young_it_employees)Data Aggregation
Data Aggregation
import pandas as pd
df = pd.DataFrame({
'department': ['IT', 'HR', 'IT', 'Finance', 'IT'],
'salary': [50000, 60000, 70000, 80000, 55000],
'experience': [2, 5, 7, 10, 3]
})
# Group by department
grouped = df.groupby('department')
# Calculate statistics for each group
# Named aggregation: new_column=(source_column, function)
dept_stats = grouped.agg(
    count=('salary', 'count'),
    mean_salary=('salary', 'mean'),
    max_salary=('salary', 'max'),
    min_salary=('salary', 'min'),
    avg_experience=('experience', 'mean')
)
print("Department statistics:\n", dept_stats)
# Multiple aggregations
complex_stats = df.groupby('department').agg(
    total_salary=('salary', 'sum'),
    employee_count=('salary', 'count'),
    salary_range=('salary', lambda x: x.max() - x.min())
)
print("Complex statistics:\n", complex_stats)
Data Cleaning with Pandas
import pandas as pd
import numpy as np
# Create DataFrame with missing values and duplicates
dirty_data = pd.DataFrame({
    'name': ['Alice', 'Bob', None, 'David', 'Alice'],
    'age': [25, 30, 35, None, 28],
    'salary': [50000, None, 70000, 80000, 55000],
    'email': ['alice@example.com', 'bob@example.com', None, 'david@example.com', None]  # placeholder addresses
})
df = dirty_data.copy()  # dirty_data is already a DataFrame; copy it rather than re-wrapping
# Handle missing values
print("Original missing values:\n", df.isnull().sum())
# Drop rows with any missing values
df_clean = df.dropna()
print("After dropping rows:\n", df_clean.isnull().sum())
# Fill missing values with specific values
df_filled = df.fillna({
    'name': 'Unknown',
    'age': df['age'].median(),
    'salary': df['salary'].mean(),
    'email': 'unknown@example.com'  # placeholder address
})
# Remove duplicates
df_no_duplicates = df.drop_duplicates()
print("After removing duplicates:\n", len(df_no_duplicates))
# Data type conversion
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['name'] = df['name'].astype('string')
print("Final cleaned data info:\n", df.info())Data Visualization
Basic Plotting with Matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Generate sample data
np.random.seed(42)
data = {
'month': np.arange(1, 13),
'sales': np.random.randint(50, 200, 12),
'profit': np.random.randint(-20, 80, 12)
}
df = pd.DataFrame(data)
# Create subplots
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
# Line plot
ax1.plot(df['month'], df['sales'], marker='o', linewidth=2, markersize=8)
ax1.set_title('Monthly Sales')
ax1.set_xlabel('Month')
ax1.set_ylabel('Sales')
ax1.grid(True, alpha=0.3)
# Bar chart
ax2.bar(df['month'], df['profit'], color=['red' if x < 0 else 'green' for x in df['profit']])
ax2.set_title('Monthly Profit')
ax2.set_xlabel('Month')
ax2.set_ylabel('Profit')
ax2.axhline(y=0, color='black', linestyle='--', alpha=0.5)
# Scatter plot
ax3.scatter(df['sales'], df['profit'], alpha=0.6, s=np.abs(df['profit'])*2)  # marker sizes must be non-negative
ax3.set_title('Sales vs Profit')
ax3.set_xlabel('Sales')
ax3.set_ylabel('Profit')
# Adjust layout and save
plt.tight_layout()
plt.savefig('business_metrics.png', dpi=300, bbox_inches='tight')
plt.show()
Advanced Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Load sample dataset
df = sns.load_dataset('tips')
# Create a figure with multiple subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Scatter plot of bill amount vs tip
sns.scatterplot(data=df, x='total_bill', y='tip', ax=axes[0,0])
axes[0,0].set_title('Bill Amount vs Tip')
# Box plot for different days
sns.boxplot(data=df, x='day', y='total_bill', ax=axes[0,1])
axes[0,1].set_title('Bill Amount by Day')
# Violin plot for time vs total bill
sns.violinplot(data=df, x='time', y='total_bill', ax=axes[1,0])
axes[1,0].set_title('Bill Distribution by Time')
# Histogram of tips
sns.histplot(data=df, x='tip', ax=axes[1,1])
axes[1,1].set_title('Tip Distribution')
plt.tight_layout()
plt.savefig('restaurant_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
Statistical Analysis
Descriptive Statistics
import pandas as pd
import numpy as np
from scipy import stats
# Generate sample data
np.random.seed(42)
data = {
    'product': np.random.choice(['A', 'B', 'C', 'D', 'E'], 100),  # all columns must share one length
    'sales': np.random.normal(100, 20, 100),
    'marketing_spend': np.random.normal(50, 10, 100),
    'customer_satisfaction': np.random.uniform(1, 5, 100)
}
df = pd.DataFrame(data)
# Basic statistics
print("Mean sales:", df['sales'].mean())
print("Median sales:", df['sales'].median())
print("Sales std:", df['sales'].std())
print("Sales range:", df['sales'].max() - df['sales'].min())
# Correlation
correlation_matrix = df.corr(numeric_only=True)  # skip the non-numeric 'product' column
print("Correlation matrix:\n", correlation_matrix)
# Correlation coefficient
correlation = df['sales'].corr(df['marketing_spend'])
print("Sales vs Marketing spend correlation:", correlation)
# Statistical test
alpha = 0.05
t_stat, p_value = stats.ttest_ind(
    df['sales'][:50],  # first 50 observations
    df['sales'][50:]   # last 50 observations
)
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print("Statistically significant:", p_value < alpha)Machine Learning with Scikit-learn
Linear Regression
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Generate sample data
np.random.seed(42)
n_samples = 100
X = np.random.rand(n_samples, 1) * 10 # Feature: marketing spend
y = 2 * X + np.random.randn(n_samples, 1) * 2 # Target: sales
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Create and train model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")
print(f"Coefficient: {model.coef_[0]:.2f}")
print(f"Intercept: {model.intercept_:.2f}")
# Visualization
plt.figure(figsize=(10, 6))
plt.scatter(X_test, y_test, alpha=0.6, label='Actual')
# Sort by X so the regression line draws cleanly left to right
order = X_test[:, 0].argsort()
plt.plot(X_test[order], y_pred[order], color='red', linewidth=2, label='Predicted')
plt.xlabel('Marketing Spend')
plt.ylabel('Sales')
plt.title('Linear Regression: Marketing Spend vs Sales')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Classification Example
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
# Sample data
data = {
'study_hours': [2, 5, 1, 6, 3, 7, 4, 8, 2],
'previous_score': [65, 75, 60, 80, 70, 85, 72, 90, 68],
'passed': [0, 1, 0, 1, 0, 1, 1, 1, 0] # 0=Fail, 1=Pass
}
df = pd.DataFrame(data)
# Prepare features and target
X = df[['study_hours', 'previous_score']]
y = df['passed']
# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Create and train model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
# Make predictions
y_pred = rf.predict(X_test_scaled)
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
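Random forests also report how much each feature contributed to the model's decisions. A quick follow-up using the rf model trained above:
# Per-feature importance scores (they sum to 1.0)
for name, importance in zip(X.columns, rf.feature_importances_):
    print(f"{name}: {importance:.2f}")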
Practical Data Science Projects
Sales Analysis Dashboard
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load and prepare data
df = pd.read_csv('sales_data.csv')
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter
# Monthly analysis
monthly_sales = df.groupby('month').agg(
    total_sales=('amount', 'sum'),
    order_count=('order_id', 'count'),
    avg_order_value=('amount', 'mean')
)
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Monthly sales trend
axes[0,0].plot(monthly_sales.index, monthly_sales['total_sales'], marker='o')
axes[0,0].set_title('Monthly Sales Trend')
axes[0,0].set_xlabel('Month')
axes[0,0].set_ylabel('Total Sales')
axes[0,0].grid(True)
# Quarterly sales comparison
quarterly_sales = df.groupby('quarter')['amount'].sum()
axes[0,1].bar(quarterly_sales.index, quarterly_sales.values)
axes[0,1].set_title('Quarterly Sales')
axes[0,1].set_xlabel('Quarter')
axes[0,1].set_ylabel('Total Sales')
# Product category performance
category_sales = df.groupby('category').agg(
    total_revenue=('amount', 'sum'),
    order_count=('order_id', 'count')
).sort_values('total_revenue', ascending=False)
top_5_categories = category_sales.head(5)
axes[1,0].barh(top_5_categories.index, top_5_categories['total_revenue'])
axes[1,0].set_title('Top 5 Categories by Revenue')
axes[1,0].set_xlabel('Total Revenue')
axes[1,1].axis('off')  # fourth panel intentionally left empty
plt.tight_layout()
plt.savefig('sales_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()
Customer Segmentation
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Generate customer data
np.random.seed(42)
n_customers = 200
customer_data = {
'customer_id': range(1, n_customers + 1),
'annual_income': np.random.lognormal(10.5, 1, n_customers),
'age': np.random.randint(18, 70, n_customers),
'spending_score': np.random.uniform(0, 100, n_customers),
'frequency': np.random.randint(1, 50, n_customers)
}
df = pd.DataFrame(customer_data)
# Select features for clustering
features = ['annual_income', 'age', 'spending_score', 'frequency']
X = df[features]
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Perform clustering
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)  # explicit n_init for consistency across scikit-learn versions
cluster_labels = kmeans.fit_predict(X_scaled)
# Add cluster labels to dataframe
df['cluster'] = cluster_labels
# Analyze clusters
cluster_analysis = df.groupby('cluster').agg(
    customer_count=('customer_id', 'count'),
    avg_income=('annual_income', 'mean'),
    avg_age=('age', 'mean'),
    avg_spending=('spending_score', 'mean'),
    avg_frequency=('frequency', 'mean')
)
print("Customer segments:")
for cluster_id, analysis in cluster_analysis.iterrows():
    print(f"Cluster {cluster_id}: {analysis['customer_count']:.0f} customers, "
          f"Avg Income: ${analysis['avg_income']:.0f}, "
          f"Avg Age: {analysis['avg_age']:.0f}, "
          f"Avg Spending: {analysis['avg_spending']:.0f}")
# Visualize clusters
plt.figure(figsize=(10, 6))
for i in range(4):
    cluster_data = df[df['cluster'] == i]
    plt.scatter(cluster_data['annual_income'], cluster_data['age'],
                alpha=0.6, s=50, label=f'Cluster {i}')
plt.xlabel('Annual Income')
plt.ylabel('Age')
plt.title('Customer Segmentation')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Best Practices
Data Science Workflow
# 1. Data Understanding
import pandas as pd
df = pd.read_csv('data.csv')
print(df.info())
print(df.describe())
print(df.head())
# 2. Data Cleaning
print(df.isnull().sum())
df_clean = df.dropna()
df_clean = df_clean.drop_duplicates()
# 3. Feature Engineering
df_clean['feature1'] = df_clean['col1'] / df_clean['col2']
df_clean = pd.get_dummies(df_clean, columns=['category'])  # get_dummies returns a whole DataFrame of indicator columns
# 4. Model Building
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
df_clean.drop('target', axis=1),
df_clean['target'],
test_size=0.2,
random_state=42
)
# 5. Evaluation and Interpretation
from sklearn.metrics import classification_report
# ... model training and evaluation
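As a minimal sketch of step 5, assuming the features are numeric and the target is a class label (hypothetical data, for illustration only):
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print(classification_report(y_test, model.predict(X_test)))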
Performance Tips
# Use vectorized operations with NumPy
import numpy as np
# Slow: Loop through array
arr = np.arange(1000000)
result = 0
for x in arr:
    result += x * 2
# Fast: Vectorized operation
result = np.sum(arr * 2) # Much faster!
# Use categorical types in Pandas
df['category'] = df['category'].astype('category') # Memory efficient
# Use chunked processing for large datasets
chunk_size = 10000
for chunk in pd.read_csv('large_file.csv', chunksize=chunk_size):
    process_chunk(chunk)  # process_chunk stands in for your own processing function
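To verify the vectorization claim on your own machine, here is a small timing sketch using the standard-library timeit module (timings vary by hardware):
import timeit
import numpy as np
arr = np.arange(1_000_000)
def loop_sum():
    total = 0
    for x in arr:
        total += x * 2
    return total
def vectorized_sum():
    return np.sum(arr * 2)
# The vectorized version is typically orders of magnitude faster,
# because the work happens in compiled C code rather than a Python loop
print("loop:      ", timeit.timeit(loop_sum, number=3))
print("vectorized:", timeit.timeit(vectorized_sum, number=3))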