Python Data Visualization
Data visualization helps you understand and communicate data insights effectively. Python offers powerful libraries for creating compelling visualizations, from simple charts to complex interactive dashboards. This guide will teach you how to create stunning visualizations using Python’s most popular data visualization libraries.
If you’re new to Python, you might want to start with Python basics and review functions first.
Getting Started with Matplotlib
Matplotlib is the foundational data visualization library in Python. It provides a flexible framework for creating static, animated, and interactive visualizations.
Installing Matplotlib
pip install matplotlib numpy pandas
Basic Plotting
import matplotlib.pyplot as plt
import numpy as np
# Sample 100 evenly spaced points on [0, 10] and evaluate both functions
x = np.linspace(0, 10, 100)
y = np.sin(x)
y2 = np.cos(x)

# A single figure holding a single axes
fig, ax = plt.subplots(figsize=(10, 6))

# Each curve is paired with its own line styling, then drawn in order
curves = (
    (y, {'label': 'sin(x)', 'color': 'blue', 'linewidth': 2}),
    (y2, {'label': 'cos(x)', 'color': 'red', 'linewidth': 2, 'linestyle': '--'}),
)
for series, line_style in curves:
    ax.plot(x, series, **line_style)

# Axis labels and title in one call, then legend and a faint grid
ax.set(xlabel='X values', ylabel='Y values', title='Sine and Cosine Functions')
ax.legend()
ax.grid(True, alpha=0.3)

# Write the figure to disk (fig is the current figure)
fig.savefig('sine_cosine_plot.png', dpi=300, bbox_inches='tight')
plt.show()
Different Chart Types
import matplotlib.pyplot as plt
import numpy as np
# Category data shared by the bar chart and the pie chart labels
categories = ['Product A', 'Product B', 'Product C', 'Product D']
values = [25, 30, 15, 35]
errors = [2, 3, 1.5, 4]

# 2x2 grid of axes, unpacked in row-major order
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
ax1, ax2, ax3, ax4 = axes.flat

# Top-left: bar chart with error bars
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']
ax1.bar(categories, values, yerr=errors, capsize=5, color=bar_colors)
ax1.set(title='Product Sales with Error Bars', ylabel='Sales (thousands)')

# Top-right: pie chart with the first slice pulled out
sizes = [30, 25, 20, 25]
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
explode = (0.1, 0, 0, 0)
ax2.pie(
    sizes,
    explode=explode,
    labels=categories,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax2.set_title('Market Share Distribution')

# Bottom-left: scatter plot, points coloured by a random scalar
np.random.seed(42)
x_scatter = np.random.randn(100)
noise = np.random.randn(100) * 0.5
y_scatter = x_scatter + noise
colors_scatter = np.random.rand(100)
ax3.scatter(
    x_scatter,
    y_scatter,
    c=colors_scatter,
    alpha=0.6,
    cmap='viridis',
    s=50,
)
ax3.set(xlabel='X values', ylabel='Y values',
        title='Scatter Plot with Color Mapping')

# Bottom-right: histogram of 1000 draws from N(100, 15)
data = np.random.normal(100, 15, 1000)
ax4.hist(data, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
ax4.set(xlabel='Value', ylabel='Frequency', title='Normal Distribution')

fig.tight_layout()
fig.savefig('chart_types.png', dpi=300, bbox_inches='tight')
plt.show()
Advanced Matplotlib Techniques
Custom Styling and Themes
import matplotlib.pyplot as plt
import numpy as np
# Switch the global style sheet before any figure is created
plt.style.use('seaborn-v0_8')

fig, ax = plt.subplots(figsize=(12, 8))

# Three sine waves over [0, 4π], phase-shifted by 0, π/2 and π
x = np.linspace(0, 4 * np.pi, 1000)
y1 = np.sin(x)
y2 = np.sin(x + np.pi/2)
y3 = np.sin(x + np.pi)

# Shade the area between each wave and y = 0
waves = (
    (y1, 'blue', 'sin(x)'),
    (y2, 'red', 'sin(x + π/2)'),
    (y3, 'green', 'sin(x + π)'),
)
for wave, shade, name in waves:
    ax.fill_between(x, wave, alpha=0.3, color=shade, label=name)

# Title, axis labels, legend and grid
ax.set_title('Overlapping Sine Waves', fontsize=16, fontweight='bold')
ax.set_xlabel('X values', fontsize=12)
ax.set_ylabel('Y values', fontsize=12)
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)

# Arrow pointing at the first maximum of sin(x), at x = π/2
peak_x = np.pi/2
ax.annotate(
    'Peak',
    xy=(peak_x, 1),
    xytext=(peak_x + 0.5, 1.2),
    arrowprops=dict(facecolor='black', shrink=0.05),
    fontsize=10,
)

plt.tight_layout()
plt.show()
Multiple Subplots and Figure Management
import matplotlib.pyplot as plt
import numpy as np
# Build a dashboard-style figure whose panels are placed on a 3x3 grid
fig = plt.figure(figsize=(16, 10))
fig.suptitle('Data Analysis Dashboard', fontsize=20, fontweight='bold')

# A GridSpec lets individual axes span several grid cells
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Main plot (rows 0-1, columns 0-1): sine wave with a decaying amplitude
ax_main = fig.add_subplot(gs[:2, :2])
x_main = np.linspace(0, 10, 100)
y_main = np.sin(x_main) * np.exp(-x_main/10)
ax_main.plot(x_main, y_main, 'b-', linewidth=3)
ax_main.set_title('Damped Oscillation')
ax_main.set_xlabel('Time')
ax_main.set_ylabel('Amplitude')

# Side histogram (rows 0-1, column 2), drawn horizontally
ax_hist = fig.add_subplot(gs[:2, 2])
data_hist = np.random.normal(0, 1, 1000)
ax_hist.hist(data_hist, bins=30, orientation='horizontal', color='orange', alpha=0.7)
ax_hist.set_title('Distribution')
ax_hist.set_xlabel('Frequency')

# Bottom left scatter: y is roughly 2*x plus unit-variance noise
ax_scatter = fig.add_subplot(gs[2, 0])
x_scatter = np.random.randn(50)
y_scatter = x_scatter * 2 + np.random.randn(50)
ax_scatter.scatter(x_scatter, y_scatter, alpha=0.6, c='green')
ax_scatter.set_title('Correlation')

# Bottom center box plot: three groups with standard deviations 1, 2 and 3
ax_box = fig.add_subplot(gs[2, 1])
data_box = [np.random.normal(0, std, 100) for std in range(1, 4)]
ax_box.boxplot(data_box)
# Label the boxes through the axis instead of boxplot(labels=...): that
# keyword was renamed to tick_labels in Matplotlib 3.9, so this form runs
# without a deprecation warning on both older and newer releases.
ax_box.set_xticklabels(['Group 1', 'Group 2', 'Group 3'])
ax_box.set_title('Box Plot')

# Bottom right pie chart (the original line had a mismatched bracket here)
ax_pie = fig.add_subplot(gs[2, 2])
sizes_pie = [30, 25, 20, 15, 10]
labels_pie = ['A', 'B', 'C', 'D', 'E']
# One Set3 colormap colour per slice
colors_pie = plt.cm.Set3(np.linspace(0, 1, len(sizes_pie)))
ax_pie.pie(sizes_pie, labels=labels_pie, colors=colors_pie, autopct='%1.0f%%')
ax_pie.set_title('Proportions')

plt.tight_layout()
plt.show()
Working with Seaborn
Seaborn builds on matplotlib and provides a high-level interface for drawing attractive statistical graphics.
Installing Seaborn
pip install seaborn
Statistical Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Create sample data: 200 rows with a category, a numeric value and a group
np.random.seed(42)
data = {
    'category': np.random.choice(['A', 'B', 'C', 'D'], 200),
    'value': np.random.normal(50, 15, 200),
    'group': np.random.choice(['Group 1', 'Group 2'], 200)
}
df = pd.DataFrame(data)

# Set style
sns.set_style("whitegrid")

# Create the 2x3 grid of axes. plt.subplots() makes its own figure, so no
# separate plt.figure() call is needed (one would only add an empty window).
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Violin plot: split=True draws the two 'group' levels as half-violins
sns.violinplot(data=df, x='category', y='value', hue='group',
               split=True, ax=axes[0, 0])
axes[0, 0].set_title('Violin Plot')

# Box plot with the individual observations swarmed on top
sns.boxplot(data=df, x='category', y='value', ax=axes[0, 1])
sns.swarmplot(data=df, x='category', y='value', color='black',
              alpha=0.5, ax=axes[0, 1])
axes[0, 1].set_title('Box Plot with Swarm')

# Heatmap of a real correlation matrix: five random variables with 50
# samples each, so values lie in [-1, 1] and center=0 is meaningful
correlation_data = np.corrcoef(np.random.rand(5, 50))
sns.heatmap(correlation_data, annot=True, cmap='coolwarm',
            center=0, ax=axes[0, 2])
axes[0, 2].set_title('Correlation Heatmap')

# Count plot: number of rows per category, split by group
sns.countplot(data=df, x='category', hue='group', ax=axes[1, 0])
axes[1, 0].set_title('Count Plot')

# Distribution plot: histogram with a KDE curve per group
sns.histplot(data=df, x='value', kde=True, hue='group',
             ax=axes[1, 1])
axes[1, 1].set_title('Distribution Plot')

# Pair plot (simplified for single subplot)
# Requires scikit-learn: pip install scikit-learn
from sklearn.datasets import load_iris
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Map the integer class codes to species names so the legend is categorical
iris_df['species'] = iris.target_names[iris.target]

# A single scatter panel stands in for the full pair plot
sns.scatterplot(data=iris_df, x='sepal length (cm)', y='petal length (cm)',
                hue='species', ax=axes[1, 2])
axes[1, 2].set_title('Iris Species Scatter Plot')

plt.tight_layout()
plt.savefig('seaborn_gallery.png', dpi=300, bbox_inches='tight')
plt.show()
Interactive Visualizations with Plotly
Plotly creates interactive, publication-quality graphs. Users can hover, zoom, and pan your plots.
Installing Plotly
pip install plotly pandas
Interactive Charts
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
# Synthetic daily series: a random walk offset by 100, one point per day
np.random.seed(42)
dates = pd.date_range('2023-01-01', periods=100, freq='D')
values = np.cumsum(np.random.randn(100)) + 100

# --- Interactive line chart ---
price_trace = go.Scatter(
    x=dates,
    y=values,
    mode='lines+markers',
    name='Stock Price',
    line={'color': 'blue', 'width': 2},
    marker={'size': 4},
)
fig_line = go.Figure(data=[price_trace])
fig_line.update_layout(
    title='Interactive Stock Price Chart',
    xaxis_title='Date',
    yaxis_title='Price ($)',
    hovermode='x unified',
    showlegend=True,
)
fig_line.show()

# --- Interactive 3D scatter plot ---
# Re-seed, then draw x, y, z coordinates followed by a colour value per point
np.random.seed(42)
n_points = 200
x_3d, y_3d, z_3d = (np.random.randn(n_points) for _ in range(3))
colors = np.random.rand(n_points)

cloud_trace = go.Scatter3d(
    x=x_3d,
    y=y_3d,
    z=z_3d,
    mode='markers',
    marker={
        'size': 5,
        'color': colors,
        'colorscale': 'Viridis',
        'opacity': 0.8,
    },
)
fig_3d = go.Figure()
fig_3d.add_trace(cloud_trace)
fig_3d.update_layout(
    title='3D Scatter Plot',
    scene={
        'xaxis_title': 'X Axis',
        'yaxis_title': 'Y Axis',
        'zaxis_title': 'Z Axis',
    },
)
fig_3d.show()

# --- Interactive heatmap ---
# Average a random matrix with its transpose to make it symmetric
raw_matrix = np.random.rand(10, 10)
corr_matrix = (raw_matrix + raw_matrix.T) / 2

heat_trace = go.Heatmap(
    z=corr_matrix,
    colorscale='RdBu',
    zmid=0,
    text=np.round(corr_matrix, 2),
    texttemplate='%{text}',
    textfont=dict(size=10),
    hoverongaps=False,
)
fig_heatmap = go.Figure(data=heat_trace)
fig_heatmap.update_layout(
    title='Interactive Correlation Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
)
fig_heatmap.show()
Real-World Data Visualization Examples
Sales Dashboard
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Create realistic sales data
np.random.seed(42)
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
products = ['Laptops', 'Smartphones', 'Tablets', 'Headphones']

# Baseline sales per product; defined once here rather than inside the loop
base_sales = {'Laptops': 50, 'Smartphones': 100, 'Tablets': 30, 'Headphones': 80}

# Generate one row per (month, product): the baseline plus Gaussian noise
# whose standard deviation is 20% of the baseline, clipped at zero
sales_data = []
for month in months:
    for product in products:
        baseline = base_sales[product]
        sales = baseline + np.random.normal(0, baseline * 0.2)
        sales_data.append({'Month': month, 'Product': product, 'Sales': max(0, sales)})
df_sales = pd.DataFrame(sales_data)

# Create comprehensive dashboard
fig = plt.figure(figsize=(20, 12))
fig.suptitle('Annual Sales Dashboard', fontsize=24, fontweight='bold')

# Define grid layout
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# 1. Monthly trend line plot (full width of the top row)
ax1 = fig.add_subplot(gs[0, :])
# groupby sorts month names alphabetically; reindex restores calendar order
monthly_total = df_sales.groupby('Month')['Sales'].sum().reindex(months)
ax1.plot(months, monthly_total, marker='o', linewidth=3, markersize=8,
         color='#2E86AB', markerfacecolor='#A23B72')
ax1.set_title('Monthly Sales Trend', fontsize=16)
ax1.set_ylabel('Total Sales')
ax1.grid(True, alpha=0.3)
ax1.tick_params(axis='x', rotation=45)

# 2. Product comparison bar chart
ax2 = fig.add_subplot(gs[1, 0])
product_totals = df_sales.groupby('Product')['Sales'].sum()
bars = ax2.bar(product_totals.index, product_totals.values,
               color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A'])
ax2.set_title('Total Sales by Product', fontsize=14)
ax2.set_ylabel('Sales')
ax2.tick_params(axis='x', rotation=45)

# Add value labels on bars (truncated to whole numbers)
for bar in bars:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}', ha='center', va='bottom')

# 3. Pie chart for market share
ax3 = fig.add_subplot(gs[1, 1])
colors = plt.cm.Set3(np.linspace(0, 1, len(product_totals)))
ax3.pie(product_totals.values, labels=product_totals.index,
        autopct='%1.1f%%', colors=colors, startangle=90)
ax3.set_title('Market Share', fontsize=14)

# 4. Box plot for sales distribution
ax4 = fig.add_subplot(gs[1, 2])
sns.boxplot(data=df_sales, x='Product', y='Sales', ax=ax4)
ax4.set_title('Sales Distribution', fontsize=14)
ax4.tick_params(axis='x', rotation=45)

# 5. Heatmap of monthly product performance (full width of the bottom row)
ax5 = fig.add_subplot(gs[2, :])
# pivot_table sorts its column labels, which would put the months in
# alphabetical order (Apr, Aug, Dec, ...); reindex back to calendar order
pivot_table = df_sales.pivot_table(values='Sales', index='Product',
                                   columns='Month', aggfunc='sum')
pivot_table = pivot_table.reindex(columns=months)
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='YlOrRd',
            ax=ax5, cbar_kws={'label': 'Sales'})
ax5.set_title('Monthly Product Performance Heatmap', fontsize=14)

plt.tight_layout()
plt.savefig('sales_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()
Data Visualization Best Practices
Choosing the Right Chart Type
"""
Guidelines for choosing charts:
Line Chart: Show trends over time
Bar Chart: Compare quantities across categories
Scatter Plot: Show relationship between two variables
Histogram: Show distribution of a single variable
Box Plot: Show distribution summary with outliers
Heatmap: Show correlation or intensity across two dimensions
Pie Chart: Show parts of a whole (limited categories)
"""
# Example: Choosing appropriate visualization for different data scenarios
import matplotlib.pyplot as plt
import numpy as np
# Scenario 1: a value evolving over time -> line chart
# 365 points of a random walk offset by 100
time_data = np.cumsum(np.random.randn(365)) + 100
_, line_ax = plt.subplots(figsize=(12, 4))
line_ax.plot(time_data)
line_ax.set(
    title='Daily Stock Prices - Line Chart Appropriate',
    xlabel='Days',
    ylabel='Price',
)
plt.show()

# Scenario 2: quantities compared across categories -> bar chart
categories = ['Product A', 'Product B', 'Product C', 'Product D']
values = [45, 78, 32, 56]
_, bar_ax = plt.subplots(figsize=(8, 4))
bar_ax.bar(categories, values)
bar_ax.set(
    title='Sales by Product - Bar Chart Appropriate',
    ylabel='Sales',
)
plt.show()

# Scenario 3: relationship between two variables -> scatter plot
# y is 2*x plus Gaussian noise scaled by 0.5
x = np.random.randn(100)
y = 2 * x + np.random.randn(100) * 0.5
_, scatter_ax = plt.subplots(figsize=(6, 6))
scatter_ax.scatter(x, y, alpha=0.6)
scatter_ax.set(
    title='Height vs Weight - Scatter Plot Appropriate',
    xlabel='Height',
    ylabel='Weight',
)
plt.show()
Design Principles
import matplotlib.pyplot as plt
import seaborn as sns
# Start from a clean, grid-backed style sheet
plt.style.use('seaborn-v0_8-whitegrid')

fig, ax = plt.subplots(figsize=(10, 6))

# Six months of sales figures
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
sales = [65, 72, 78, 85, 92, 88]

# One fill colour with a darker outline keeps the bars visually uniform
bars = ax.bar(months, sales, color='#4ECDC4', edgecolor='#2E86AB', linewidth=2)

# Print each bar's value just above its top edge, centred horizontally
for rect in bars:
    height = rect.get_height()
    center = rect.get_x() + rect.get_width()/2.
    ax.text(center, height + 1, f'{height}',
            ha='center', va='bottom', fontweight='bold')

# Title and axis labels with extra padding
ax.set_title('Monthly Sales Performance', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Month', fontsize=12, labelpad=10)
ax.set_ylabel('Sales (thousands)', fontsize=12, labelpad=10)

# Hide the top and right borders to reduce clutter
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Dashed reference line at the mean of the sales values
average_sales = sum(sales)/len(sales)
ax.axhline(y=average_sales, color='red', linestyle='--', alpha=0.7, label='Average')
ax.legend()

plt.tight_layout()
plt.show()
Saving and Exporting Visualizations
import matplotlib.pyplot as plt
import numpy as np
# A simple sine curve to export
x = np.linspace(0, 10, 100)
y = np.sin(x)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y, linewidth=2)
ax.set(title='Sample Plot for Export', xlabel='X', ylabel='sin(X)')

# Output file names paired with a short note on each format;
# the format is chosen from the file extension
formats = [
    ('plot.png', 'PNG - Good for web'),
    ('plot.jpg', 'JPEG - Smaller file size'),
    ('plot.pdf', 'PDF - Vector graphics for publications'),
    ('plot.svg', 'SVG - Scalable vector graphics'),
    ('plot.eps', 'EPS - Encapsulated PostScript')
]

# The same save settings are applied to every format
save_options = {
    'dpi': 300,
    'bbox_inches': 'tight',
    'facecolor': 'white',
    'edgecolor': 'none',
}
for filename, description in formats:
    fig.savefig(filename, **save_options)
    print(f"Saved {filename} - {description}")
plt.show()
Common Pitfalls and How to Avoid Them
import matplotlib.pyplot as plt
import numpy as np
# Pitfall 1: Inappropriate axis scales
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
x = np.arange(5)
y = [100, 101, 102, 103, 104]

# The same bars are drawn twice; only the y-limits differ
axis_panels = (
    # Bad: truncated y-axis exaggerates small differences (misleading!)
    (ax1, 'Bad: Truncated Y-Axis (starts at 95)', (95, 105)),
    # Good: y-axis starts at zero
    (ax2, 'Good: Y-Axis starts at 0', (0, 110)),
)
for axis, heading, (low, high) in axis_panels:
    axis.bar(x, y)
    axis.set_title(heading)
    axis.set_ylim(low, high)

plt.tight_layout()
plt.show()

# Pitfall 2: Too many colors/poor color choice
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
categories = [f'Category {i}' for i in range(15)]
values = np.random.randint(1, 50, 15)

# Bad: fifteen unrelated bright colours
colors_bad = ['red', 'blue', 'green', 'yellow', 'purple', 'orange',
              'pink', 'brown', 'gray', 'cyan', 'magenta', 'lime',
              'navy', 'teal', 'maroon']
# Good: evenly spaced samples from the sequential viridis colormap
colors_good = plt.cm.viridis(np.linspace(0, 1, len(categories)))

color_panels = (
    (ax1, colors_bad, 'Bad: Too Many Colors'),
    (ax2, colors_good, 'Good: Sequential Color Palette'),
)
for axis, palette, heading in color_panels:
    axis.bar(categories, values, color=palette)
    axis.set_title(heading)
    axis.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()
Resources and Further Learning
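Official Documentation:
- Matplotlib: https://matplotlib.org/stable/
- Seaborn: https://seaborn.pydata.org/
- Plotly for Python: https://plotly.com/python/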
Data Visualization Best Practices:
- Use appropriate chart types for your data
- Keep designs simple and clear
- Use color thoughtfully and consider colorblind users
- Always label axes and provide context
- Test visualizations with your target audience
This tutorial covers the fundamentals of data visualization in Python. Practice with real datasets to build your skills and explore the extensive documentation available for each library to discover more advanced features.
For more Python tutorials, check out our guides on web development with Flask and data science with Pandas and NumPy.