Python Data Science with Pandas & NumPy

Python Data Science with Pandas & NumPy

Python is one of the most widely used languages for data science. With libraries like NumPy and Pandas for numerical work and data manipulation, and Matplotlib and Seaborn for plotting, you can analyze, transform, and visualize data like a professional data scientist.

What is Data Science?

Data science combines statistics, programming, and domain expertise to extract insights from data. Python’s ecosystem makes it the perfect language for data science workflows.

Why Python for Data Science?

  • Rich Libraries: NumPy, Pandas, Matplotlib, Scikit-learn
  • Easy Syntax: Readable code for complex operations
  • Community Support: Extensive documentation and tutorials
  • Integration Ready: Works with Jupyter notebooks
  • Industry Standard: Used by data scientists worldwide

NumPy Fundamentals

Installation

pip install numpy

NumPy Arrays

import numpy as np

# Build arrays from plain Python lists: a flat list gives a 1-D array,
# a list of equal-length lists gives a 2-D array.
arr = np.array([1, 2, 3, 4, 5])
matrix = np.array([
    [1, 2],
    [3, 4],
])

# Contents
print(arr)           # [1 2 3 4 5]
print(matrix)        # [[1 2]
                     #  [3 4]]

# Metadata
print(arr.dtype)     # int64 (platform default integer; may be int32 on some systems)
print(arr.shape)     # (5,)   -> one axis of length 5
print(matrix.shape)  # (2, 2) -> two rows, two columns

Array Operations

import numpy as np

arr = np.array([10, 20, 30, 40, 50])

# Arithmetic with a scalar is applied to every element
print(arr + 5)  # [15 25 35 45 55]
print(arr * 2)  # [ 20  40  60  80 100]
print(arr - 5)  # [ 5 15 25 35 45]

# Summary statistics (array methods are equivalent to the np.* functions)
print(arr.mean())      # 30.0
print(np.median(arr))  # 30.0
print(arr.std())       # 14.142135623730951 (population std, ddof=0)
print(arr.sum())       # 150
print(arr.max())       # 50
print(arr.min())       # 10

Array Creation Methods

import numpy as np

# Constant-filled arrays
zeros = np.zeros(5)                    # [0. 0. 0. 0. 0.]
ones = np.ones(3)                      # [1. 1. 1.]
full = np.full(shape=4, fill_value=7)  # [7 7 7 7] (integer dtype, taken from the fill value)

# Evenly spaced values
range_arr = np.arange(0, 10, 2)        # [0 2 4 6 8] (stop is exclusive)
linspace = np.linspace(0, 1, num=5)    # [0.   0.25 0.5  0.75 1.  ] (stop is inclusive)

# Uniform random samples in [0, 1)
random_arr = np.random.rand(3)

# Reshaping keeps the same 12 elements and only changes the layout
arr = np.arange(12)
reshaped = np.reshape(arr, (3, 4))      # 3x4 matrix
reshaped_back = reshaped.reshape(-1)    # -1 lets NumPy infer the length: back to 1-D

Mathematical Functions

import numpy as np

# Trigonometric functions work element-wise on angles given in radians
angles = np.array([0, np.pi/4, np.pi/2])
sin_vals = np.sin(angles)
cos_vals = np.cos(angles)
tan_vals = np.tan(angles)  # tan(pi/2) is undefined; floating point yields a huge number, not inf

# Exponential and logarithmic
exp_vals = np.exp([1, 2, 3])       # [e^1, e^2, e^3]
log_vals = np.log([1, np.e, 10])   # natural log: [0, 1, 2.302585]

# Linear algebra operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

C = A + B            # Element-wise addition
D = A * B            # Element-wise product -- `*` is NOT matrix multiplication
E = np.dot(A, B)     # Matrix multiplication (for 2-D arrays, same as `A @ B`)
F = np.transpose(A)  # Transpose matrix (same as `A.T`)

print("Matrix addition:\n", C)
print("Element-wise multiplication:\n", D)
print("Matrix multiplication:\n", E)

Pandas Fundamentals

Installation

pip install pandas

Pandas Series

import pandas as pd
import numpy as np

# A Series built from a list gets a default integer index (0..n-1)
data = [10, 20, 30, 40, 50]
series = pd.Series(data=data, name='numbers')

print(series)
# 0    10
# 1    20
# 2    30
# 3    40
# 4    50
# Name: numbers, dtype: int64

# Supplying `index` replaces the default positions with explicit labels
indices = list('abcde')
series_with_index = pd.Series(data=[100, 200, 300, 400, 500], index=indices)

print(series_with_index)
# a    100
# b    200
# c    300
# d    400
# e    500
# dtype: int64

Pandas DataFrame

import pandas as pd
import numpy as np

# Each keyword becomes a column; all columns must have the same length
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    age=[25, 30, 35, 40, 28],
    city=['New York', 'Los Angeles', 'Chicago', 'Boston', 'Seattle'],
    salary=[50000, 60000, 70000, 80000, 55000],
)

df = pd.DataFrame(data)

print(df)
#       name  age         city  salary
# 0    Alice   25     New York   50000
# 1      Bob   30  Los Angeles   60000
# 2  Charlie   35      Chicago   70000
# 3    David   40       Boston   80000
# 4      Eva   28      Seattle   55000

print(df.dtypes)
# name      object   (text columns; shown as a string dtype on newer pandas)
# age        int64
# city      object
# salary     int64

print(df.shape)   # (5, 4) -> 5 rows, 4 columns

Data Import and Export

import pandas as pd

# This snippet expects data.csv, data.xlsx and data.json to exist in the
# current working directory; each reader raises if its file is missing.

# Read CSV file
df = pd.read_csv('data.csv')
print(df.head())  # head() shows the first 5 rows by default

# Read Excel file
# NOTE(review): read_excel needs an Excel engine (e.g. openpyxl) installed -- confirm for your setup
df_excel = pd.read_excel('data.xlsx')
print(df_excel.head())

# Read JSON file
df_json = pd.read_json('data.json')
print(df_json.head())

# Everything below writes the DataFrame loaded from data.csv (`df`),
# not the Excel or JSON frames.

# Export to CSV
# index=False leaves the row index out of the written file
df.to_csv('output.csv', index=False)

# Export to Excel
df.to_excel('output.xlsx', index=False)

# Export to JSON
# orient='records' writes a list with one JSON object per row
df.to_json('output.json', orient='records')

Data Analysis with Pandas

Basic Data Exploration

import pandas as pd
import numpy as np

# Load sample data
data = {
    'product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor'],
    'price': [999, 29, 79, 299],
    'category': ['Electronics', 'Electronics', 'Electronics', 'Electronics'],
    'rating': [4.5, 4.2, 4.7, 4.8],
    'stock': [50, 200, 150, 75]
}

df = pd.DataFrame(data)

# Basic information
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Data types:\n", df.dtypes)
print("Basic statistics:\n", df.describe())  # numeric columns only by default

# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Structure summary: df.info() prints its report itself and returns None,
# so it must be called directly rather than wrapped in print().
df.info()

# Memory usage per column in bytes; deep=True also counts the string contents
print("Memory usage (bytes):\n", df.memory_usage(deep=True))

Data Selection and Filtering

import pandas as pd

# Create sample DataFrame
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'age': [25, 30, 35, 40, 28],
    'department': ['IT', 'HR', 'IT', 'Finance', 'IT'],
    'salary': [50000, 60000, 70000, 80000, 55000],
    'city': ['NYC', 'LA', 'Chicago', 'Boston', 'Seattle']
})

# Select columns
names = df['name']                                # single column -> Series
ages = df[['name', 'age']]                        # list of columns -> DataFrame
all_columns = df.loc[:, ['name', 'age', 'city']]  # all rows, chosen columns by label

# Select rows by position.
# iloc is positional and end-exclusive, like Python list slicing. loc is
# label-based and end-inclusive: df.loc[0:3] would return four rows and
# df.loc[-3:] would return every row, because no label is below -3.
first_three = df.iloc[:3]
last_three = df.iloc[-3:]

# Conditional selection (boolean masks); combine conditions with & / | and
# wrap each one in parentheses
high_earners = df[df['salary'] > 60000]
it_employees = df[df['department'] == 'IT']
young_it_employees = df[(df['department'] == 'IT') & (df['age'] < 35)]

print("High earners (>60k):\n", high_earners)
print("IT department employees:\n", it_employees)
print("Young IT employees (<35):\n", young_it_employees)

Data Aggregation

import pandas as pd

df = pd.DataFrame({
    'department': ['IT', 'HR', 'IT', 'Finance', 'IT'],
    'salary': [50000, 60000, 70000, 80000, 55000],
    'experience': [2, 5, 7, 10, 3]
})

# Group by department
grouped = df.groupby('department')

# Calculate statistics for each group.
# Named aggregation takes keyword arguments of the form
# output_column=(input_column, function). Passing the same pairs in a dict
# fails, because dict keys are interpreted as existing column names.
dept_stats = grouped.agg(
    count=('salary', 'count'),
    mean_salary=('salary', 'mean'),
    max_salary=('salary', 'max'),
    min_salary=('salary', 'min'),
    avg_experience=('experience', 'mean'),
)

print("Department statistics:\n", dept_stats)

# Multiple aggregations, including a custom function applied to each group
complex_stats = df.groupby('department').agg(
    total_salary=('salary', 'sum'),
    employee_count=('salary', 'count'),
    salary_range=('salary', lambda x: x.max() - x.min()),
)

print("Complex statistics:\n", complex_stats)

Data Cleaning with Pandas

import pandas as pd
import numpy as np

# Create DataFrame with missing values and a repeated name ('Alice')
# NOTE(review): 'alice@. com' is kept exactly as found and may be deliberately
# malformed; the example.com addresses are placeholders -- confirm the intended
# sample values.
dirty_data = pd.DataFrame({
    'name': ['Alice', 'Bob', None, 'David', 'Alice'],
    'age': [25, 30, 35, None, 28],
    'salary': [50000, None, 70000, 80000, 55000],
    'email': ['alice@. com', 'bob@example.com', None, 'david@example.com', None]
})

# Work on a copy so the raw data stays untouched
df = dirty_data.copy()

# Handle missing values
print("Original missing values:\n", df.isnull().sum())

# Drop rows with any missing values
df_clean = df.dropna()
print("After dropping rows:\n", df_clean.isnull().sum())

# Fill missing values with a per-column replacement
df_filled = df.fillna({
    'name': 'Unknown',
    'age': df['age'].median(),
    'salary': df['salary'].mean(),
    'email': 'unknown@example.com'
})

# Remove duplicates.
# No two rows are identical across all columns, so a plain drop_duplicates()
# would remove nothing; deduplicate on 'name' (the first occurrence is kept).
df_no_duplicates = df.drop_duplicates(subset=['name'])
print("After removing duplicates:\n", len(df_no_duplicates))

# Data type conversion
df['age'] = pd.to_numeric(df['age'], errors='coerce')  # unparseable values become NaN
df['name'] = df['name'].astype('string')

# df.info() prints its report itself and returns None, so call it directly
print("Final cleaned data info:")
df.info()

Data Visualization

Basic Plotting with Matplotlib

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Generate sample data (seeded so the figure is reproducible)
np.random.seed(42)
data = {
    'month': np.arange(1, 13),
    'sales': np.random.randint(50, 200, 12),
    'profit': np.random.randint(-20, 80, 12)  # may be negative (a loss)
}

df = pd.DataFrame(data)

# Create subplots: one row, three panels
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# Line plot
ax1.plot(df['month'], df['sales'], marker='o', linewidth=2, markersize=8)
ax1.set_title('Monthly Sales')
ax1.set_xlabel('Month')
ax1.set_ylabel('Sales')
ax1.grid(True, alpha=0.3)

# Bar chart: losses in red, gains in green
ax2.bar(df['month'], df['profit'], color=['red' if x < 0 else 'green' for x in df['profit']])
ax2.set_title('Monthly Profit')
ax2.set_xlabel('Month')
ax2.set_ylabel('Profit')
ax2.axhline(y=0, color='black', linestyle='--', alpha=0.5)

# Scatter plot
# Marker area `s` must be non-negative, so size by the magnitude of the profit
# (plus a small base so near-zero months stay visible) instead of the raw,
# possibly negative value.
marker_sizes = df['profit'].abs() * 2 + 10
ax3.scatter(df['sales'], df['profit'], alpha=0.6, s=marker_sizes)
ax3.set_title('Sales vs Profit')
ax3.set_xlabel('Sales')
ax3.set_ylabel('Profit')

# Adjust layout and save (save before show(), which may clear the figure)
plt.tight_layout()
plt.savefig('business_metrics.png', dpi=300, bbox_inches='tight')
plt.show()

Advanced Visualizations

import matplotlib.pyplot as plt  # needed for plt.subplots / savefig / show below
import seaborn as sns
import pandas as pd
import numpy as np

# Load sample dataset (seaborn fetches it on first use, so this needs network access)
df = sns.load_dataset('tips')

# Create a figure with multiple subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Scatter plot with regression line (regplot draws both the points and the fit)
sns.regplot(data=df, x='total_bill', y='tip', ax=axes[0,0])
axes[0,0].set_title('Bill Amount vs Tip')

# Box plot for different days
sns.boxplot(data=df, x='day', y='total_bill', ax=axes[0,1])
axes[0,1].set_title('Bill Amount by Day')

# Violin plot for time vs total bill
sns.violinplot(data=df, x='time', y='total_bill', ax=axes[1,0])
axes[1,0].set_title('Bill Distribution by Time')

# Histogram of tips
sns.histplot(data=df, x='tip', ax=axes[1,1])
axes[1,1].set_title('Tip Distribution')

plt.tight_layout()
plt.savefig('restaurant_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

Statistical Analysis

Descriptive Statistics

import pandas as pd
import numpy as np
from scipy import stats

# Generate sample data
np.random.seed(42)
n_obs = 100
data = {
    # Every column must have the same length, so repeat the five product
    # labels to cover all observations (A, B, C, D, E, A, B, ...)
    'product': np.tile(['A', 'B', 'C', 'D', 'E'], n_obs // 5),
    'sales': np.random.normal(100, 20, n_obs),
    'marketing_spend': np.random.normal(50, 10, n_obs),
    'customer_satisfaction': np.random.uniform(1, 5, n_obs)
}

df = pd.DataFrame(data)

# Basic statistics
print("Mean sales:", df['sales'].mean())
print("Median sales:", df['sales'].median())
print("Sales std:", df['sales'].std())  # sample standard deviation (ddof=1)
print("Sales range:", df['sales'].max() - df['sales'].min())

# Correlation between the numeric columns; the text column 'product' must be
# excluded explicitly, otherwise recent pandas versions raise an error
correlation_matrix = df.corr(numeric_only=True)
print("Correlation matrix:\n", correlation_matrix)

# Correlation coefficient (Pearson) for a single pair of columns
correlation = df['sales'].corr(df['marketing_spend'])
print("Sales vs Marketing spend correlation:", correlation)

# Statistical test: independent two-sample t-test on the two halves of the data
alpha = 0.05
t_stat, p_value = stats.ttest_ind(
    df['sales'][:50],  # First 50 observations
    df['sales'][50:],  # Last 50 observations
)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print("Statistically significant:", p_value < alpha)

Machine Learning with Scikit-learn

Linear Regression

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Generate sample data
np.random.seed(42)
n_samples = 100
X = np.random.rand(n_samples, 1) * 10  # Feature: marketing spend, shape (n_samples, 1)
# Target: sales = 2 * spend + noise. Kept 1-D so that coef_[0] and intercept_
# are plain scalars; with a 2-D target they are arrays and the ':.2f' format
# specifiers below raise a TypeError.
y = 2 * X[:, 0] + np.random.randn(n_samples) * 2

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and train model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

print(f"Coefficient: {model.coef_[0]:.2f}")
print(f"Intercept: {model.intercept_:.2f}")

# Visualization
# Sort by x so the fitted line is drawn left to right as a single stroke
order = np.argsort(X_test[:, 0])

plt.figure(figsize=(10, 6))
plt.scatter(X_test, y_test, alpha=0.6, label='Actual')
plt.plot(X_test[order], y_pred[order], color='red', linewidth=2, label='Predicted')
plt.xlabel('Marketing Spend')
plt.ylabel('Sales')
plt.title('Linear Regression: Marketing Spend vs Sales')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Classification Example

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Sample data
# NOTE(review): the last two labels were [0, 1], i.e. 8 study hours / score 90
# marked as Fail and 2 hours / score 68 marked as Pass, which contradicts the
# rest of the sample. They are assumed to have been swapped -- confirm.
data = {
    'study_hours': [2, 5, 1, 6, 3, 7, 4, 8, 2],
    'previous_score': [65, 75, 60, 80, 70, 85, 72, 90, 68],
    'passed': [0, 1, 0, 1, 0, 1, 1, 1, 0]  # 0=Fail, 1=Pass
}

df = pd.DataFrame(data)

# Prepare features and target
X = df[['study_hours', 'previous_score']]
y = df['passed']

# Split data (with 9 rows and test_size=0.2 the test set holds only 2 rows,
# so the reported metrics are illustrative, not meaningful)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: fit the scaler on the training split only, then apply the
# same transformation to the test split to avoid leaking test statistics.
# (Tree-based models do not require scaling; shown here as a general pattern.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Make predictions
y_pred = rf.predict(X_test_scaled)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
# zero_division=0 avoids undefined-metric warnings when a class is missing
# from the tiny test split
print(classification_report(y_test, y_pred, zero_division=0))

Practical Data Science Projects

Sales Analysis Dashboard

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load and prepare data
# The code below uses the columns 'date', 'amount', 'order_id' and 'category'
# from sales_data.csv.
df = pd.read_csv('sales_data.csv')
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter

# Monthly analysis
# Named aggregation uses keyword arguments: output_column=(input_column, function).
# Passing these pairs as a dict fails because dict keys must be existing columns.
monthly_sales = df.groupby('month').agg(
    total_sales=('amount', 'sum'),
    order_count=('order_id', 'count'),
    avg_order_value=('amount', 'mean'),
)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Monthly sales trend
axes[0,0].plot(monthly_sales.index, monthly_sales['total_sales'], marker='o')
axes[0,0].set_title('Monthly Sales Trend')
axes[0,0].set_xlabel('Month')
axes[0,0].set_ylabel('Total Sales')
axes[0,0].grid(True)

# Quarterly sales comparison
quarterly_sales = df.groupby('quarter')['amount'].sum()
axes[0,1].bar(quarterly_sales.index, quarterly_sales.values)
axes[0,1].set_title('Quarterly Sales')
axes[0,1].set_xlabel('Quarter')
axes[0,1].set_ylabel('Total Sales')

# Product category performance, best-selling category first
category_sales = df.groupby('category').agg(
    total_revenue=('amount', 'sum'),
    order_count=('order_id', 'count'),
).sort_values('total_revenue', ascending=False)

# The data is grouped by category, so the chart is labelled accordingly
top_5_categories = category_sales.head(5)
axes[1,0].barh(top_5_categories.index, top_5_categories['total_revenue'])
axes[1,0].set_title('Top 5 Categories by Revenue')
axes[1,0].set_xlabel('Total Revenue')

# Only three charts are drawn; hide the unused fourth panel instead of
# leaving an empty set of axes in the saved image
axes[1,1].axis('off')

plt.tight_layout()
plt.savefig('sales_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()

Customer Segmentation

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Generate customer data (seeded for reproducibility)
np.random.seed(42)
n_customers = 200
n_clusters = 4
customer_data = {
    'customer_id': range(1, n_customers + 1),
    'annual_income': np.random.lognormal(10.5, 1, n_customers),
    'age': np.random.randint(18, 70, n_customers),
    'spending_score': np.random.uniform(0, 100, n_customers),
    'frequency': np.random.randint(1, 50, n_customers)
}

df = pd.DataFrame(customer_data)

# Select features for clustering (customer_id is an identifier, not a feature)
features = ['annual_income', 'age', 'spending_score', 'frequency']
X = df[features]

# Scale features: k-means is distance based, so unscaled income would
# dominate the other columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform clustering
# n_init is set explicitly so the result does not depend on the installed
# scikit-learn version's default
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Add cluster labels to dataframe
df['cluster'] = cluster_labels

# Analyze clusters
# Named aggregation uses keyword arguments: output_column=(input_column, function).
# Passing these pairs as a dict fails because dict keys must be existing columns.
cluster_analysis = df.groupby('cluster').agg(
    customer_count=('customer_id', 'count'),
    avg_income=('annual_income', 'mean'),
    avg_age=('age', 'mean'),
    avg_spending=('spending_score', 'mean'),
    avg_frequency=('frequency', 'mean'),
)

print("Customer segments:")
for cluster_id, analysis in cluster_analysis.iterrows():
    # iterrows() yields each row as a float Series, so cast the count back
    # to int to print "50 customers" rather than "50.0 customers"
    print(f"Cluster {cluster_id}: {int(analysis['customer_count'])} customers, "
          f"Avg Income: ${analysis['avg_income']:.0f}, "
          f"Avg Age: {analysis['avg_age']:.0f}, "
          f"Avg Spending: {analysis['avg_spending']:.0f}")

# Visualize clusters (two of the four clustering features are shown)
plt.figure(figsize=(10, 6))
for i in range(n_clusters):
    cluster_data = df[df['cluster'] == i]
    plt.scatter(cluster_data['annual_income'], cluster_data['age'],
                alpha=0.6, s=50, label=f'Cluster {i}')

plt.xlabel('Annual Income')
plt.ylabel('Age')
plt.title('Customer Segmentation')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Best Practices

Data Science Workflow

# Workflow template. The file name and the column names ('col1', 'col2',
# 'category', 'target') are placeholders -- replace them with your own.
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# 1. Data Understanding
df = pd.read_csv('data.csv')
df.info()  # prints its report itself and returns None, so no print() wrapper
print(df.describe())
print(df.head())

# 2. Data Cleaning
print(df.isnull().sum())
# Explicit copy so the derived columns below are written to an independent
# DataFrame
df_clean = df.dropna().drop_duplicates().copy()

# 3. Feature Engineering
df_clean['feature1'] = df_clean['col1'] / df_clean['col2']
# One-hot encode 'category'. get_dummies returns one column per category, so
# it cannot be assigned to a single column; `columns=` replaces 'category'
# with indicator columns named 'category_<value>'.
df_clean = pd.get_dummies(df_clean, columns=['category'], prefix='category')

# 4. Model Building
X_train, X_test, y_train, y_test = train_test_split(
    df_clean.drop('target', axis=1),  # features: every column except the target
    df_clean['target'],
    test_size=0.2,
    random_state=42
)

# 5. Evaluation and Interpretation
# ... model training and evaluation (e.g. with classification_report)

Performance Tips

# Use vectorized operations with NumPy
import numpy as np
import pandas as pd  # needed for the DataFrame and read_csv examples below

arr = np.arange(1000000)

# Slow: Python-level loop, one interpreter step per element
result = 0
for x in arr:
    result += x * 2

# Fast: a single vectorized expression evaluated in compiled code
result = np.sum(arr * 2)  # Much faster!

# Use categorical types in Pandas
# Small stand-in frame so the snippet runs on its own; use your own DataFrame.
df = pd.DataFrame({'category': ['A', 'B', 'A', 'C', 'B', 'A']})
df['category'] = df['category'].astype('category')  # Memory efficient for repeated values


def process_chunk(chunk):
    """Placeholder for per-chunk work; replace with real processing.

    Returns the number of rows in *chunk*.
    """
    return len(chunk)


# Use chunked processing for large datasets: with `chunksize`, read_csv yields
# DataFrames of at most chunk_size rows instead of loading the whole file
chunk_size = 10000
for chunk in pd.read_csv('large_file.csv', chunksize=chunk_size):
    process_chunk(chunk)

External Resources:

Related Tutorials:

Last updated on