Python CSV Handling
CSV (Comma-Separated Values) is a common format for tabular data. Python’s csv module provides tools for reading and writing CSV files, handling various delimiters and edge cases.
CSV Basics
CSV files store tabular data as plain text.
CSV Structure
# A minimal CSV file looks like this:
# name,age,city
# Alice,30,New York
# Bob,25,Chicago
# Charlie,35,Los Angeles
print("CSV files contain rows of comma-separated values")
print("First row usually contains column headers")CSV is simple but has variations in delimiters and quoting. See CSV format overview.
Reading CSV Files
Use csv.reader() to read CSV data.
Basic Reading
import csv
from io import StringIO

# Sample CSV data
csv_data = """name,age,city
Alice,30,New York
Bob,25,Chicago
Charlie,35,Los Angeles"""

# Wrap the string in a file-like object so csv.reader can parse it.
csv_file = StringIO(csv_data)
for row in csv.reader(csv_file):
    print(row)
# Output:
# ['name', 'age', 'city']
# ['Alice', '30', 'New York']
# ['Bob', '25', 'Chicago']
# ['Charlie', '35', 'Los Angeles']

csv.reader() returns an iterator of lists, one per row.
Reading from Files
import csv

# Read a CSV file from disk inside a with block so it closes automatically.
with open("data.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    # The first row holds the column names; consume it before the data rows.
    header = next(reader)
    print(f"Headers: {header}")
    # Each remaining row is one record.
    for row in reader:
        name, age, city = row
print(f"{name} is {age} years old and lives in {city}")Use with statement for automatic file closing. next() skips header row.
Handling Different Delimiters
import csv
from io import StringIO

# Tab-separated values (TSV)
tsv_data = "name\tage\tcity\nAlice\t30\tNew York"
tsv_file = StringIO(tsv_data)

# Pass delimiter='\t' so fields are split on tabs rather than commas.
for row in csv.reader(tsv_file, delimiter='\t'):
    print(row)

# Semicolon-separated (common in Europe)
semicolon_data = "name;age;city\nAlice;30;New York"
semicolon_file = StringIO(semicolon_data)
reader = csv.reader(semicolon_file, delimiter=';')
for row in reader:
    print(row)

Use delimiter parameter for non-comma separators.
Writing CSV Files
Use csv.writer() to create CSV files.
Basic Writing
import csv

# Rows to write, header row first.
data = [
    ["name", "age", "city"],
    ["Alice", "30", "New York"],
    ["Bob", "25", "Chicago"],
    ["Charlie", "35", "Los Angeles"],
]

# newline="" lets the csv module manage line endings itself.
with open("output.csv", "w", newline="") as out:
    csv.writer(out).writerows(data)
print("CSV file created successfully")
# Verify by reading back
with open("output.csv", "r") as file:
    print(file.read())

newline="" prevents extra blank lines on Windows. writerows() writes multiple rows.
Writing Row by Row
import csv

# Records to persist, one dict per employee.
employees = [
    {"name": "Alice", "department": "Engineering", "salary": 75000},
    {"name": "Bob", "department": "Sales", "salary": 65000},
    {"name": "Charlie", "department": "Marketing", "salary": 70000},
]

# The fieldnames list fixes the column order in the output file.
fieldnames = ["name", "department", "salary"]
with open("employees.csv", "w", newline="") as out:
    writer = csv.DictWriter(out, fieldnames=fieldnames)
    writer.writeheader()
    # Write one record at a time.
    for record in employees:
        writer.writerow(record)
print("Employee CSV created")csv.DictWriter writes dictionaries as CSV rows.
DictReader and DictWriter
Handle CSV data as dictionaries for easier access.
DictReader
import csv
from io import StringIO

# CSV with headers
csv_content = """name,age,city,department
Alice,30,New York,Engineering
Bob,25,Chicago,Sales
Charlie,35,Los Angeles,Marketing"""

csv_file = StringIO(csv_content)
# DictReader yields each row as a dict keyed by the header names.
for row in csv.DictReader(csv_file):
    print(f"{row['name']} works in {row['department']} and is {row['age']} years old")

# Rewind and build a fresh reader to inspect the detected fieldnames.
csv_file.seek(0)  # Reset file pointer
reader = csv.DictReader(csv_file)
print(f"Fieldnames: {reader.fieldnames}")DictReader returns each row as a dictionary with column names as keys.
DictWriter
import csv

# Catalog entries, one dict per product.
products = [
    {"name": "Laptop", "price": 999.99, "category": "Electronics"},
    {"name": "Book", "price": 19.99, "category": "Education"},
    {"name": "Chair", "price": 149.99, "category": "Furniture"},
]

fieldnames = ["name", "price", "category"]
with open("products.csv", "w", newline="") as out:
    writer = csv.DictWriter(out, fieldnames=fieldnames)
    writer.writeheader()
    # writerows() handles the whole list in one call.
    writer.writerows(products)
print("Products CSV created")

# Read back and display
with open("products.csv", "r") as file:
    reader = csv.DictReader(file)
for product in reader:
    print(f"{product['name']}: ${product['price']} ({product['category']})")

DictWriter takes dictionaries and writes them as CSV rows.
Handling Edge Cases
CSV files can have tricky data that needs special handling.
Quoted Fields
import csv
from io import StringIO

# Quoted fields may contain the delimiter itself and doubled ("") quotes.
csv_data = '''name,description,price
"Widget A","A useful widget, with comma",19.99
"Widget B","Another widget",29.99
"Widget C","Widget with ""quotes""",39.99'''

# The parser unescapes the quoting automatically.
for row in csv.reader(StringIO(csv_data)):
    print(row)
# Parsed result:
# ['name', 'description', 'price']
# ['Widget A', 'A useful widget, with comma', '19.99']
# ['Widget B', 'Another widget', '29.99']
# ['Widget C', 'Widget with "quotes"', '39.99']

CSV handles quoted fields with embedded commas and quotes.
Custom Quoting
import csv

# Values containing delimiters or quote characters.
data = [
    ["name", "description"],
    ["Widget", "A widget, with comma"],
    ["Gadget", 'Gadget with "quotes"'],
    ["Tool", "Normal description"],
]

# QUOTE_ALL wraps every field in quotes, needed or not.
with open("quoted.csv", "w", newline="") as out:
    csv.writer(out, quoting=csv.QUOTE_ALL).writerows(data)
print("CSV with all fields quoted created")

# QUOTE_MINIMAL (the default) quotes only fields that require it.
with open("minimal.csv", "w", newline="") as out:
    csv.writer(out, quoting=csv.QUOTE_MINIMAL).writerows(data)
print("CSV with minimal quoting created")Control quoting behavior with the quoting parameter.
Data Processing
Common operations when working with CSV data.
Filtering and Transforming
import csv

# Read employees.csv, keep only high earners, and write them with an
# extra computed "bonus" column to high_salary.csv.
with open("employees.csv", "r") as infile, open("high_salary.csv", "w", newline="") as outfile:
    reader = csv.DictReader(infile)
    # Bug fix: the output schema must include the extra "bonus" column.
    # Using the input fieldnames unchanged makes DictWriter.writerow raise
    # ValueError ("dict contains fields not in fieldnames") for every row
    # that gains the "bonus" key below.
    fieldnames = list(reader.fieldnames or []) + ["bonus"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in reader:
        # Salaries arrive as strings; convert before comparing.
        salary = float(row["salary"])
        if salary > 70000:
            # High earners get a 10% bonus column in the output.
            row["bonus"] = str(salary * 0.1)
            writer.writerow(row)
print("Filtered and transformed data saved")Process CSV data on-the-fly during reading/writing.
Data Analysis
import csv
from collections import defaultdict

# Tally headcount and total salary per department.
department_count = defaultdict(int)
department_salary = defaultdict(float)

with open("employees.csv", "r") as file:
    for row in csv.DictReader(file):
        dept = row["department"]
        department_count[dept] += 1
        department_salary[dept] += float(row["salary"])

# Report each department's size and average salary.
for dept in department_count:
    count = department_count[dept]
    avg_salary = department_salary[dept] / count
print(f"{dept}: {count} employees, avg salary ${avg_salary:.2f}")Perform data analysis on CSV content.
Large File Handling
import csv
def process_large_csv(filename, batch_size=1000):
    """Stream a large CSV file and hand its rows to process_batch in chunks.

    Rows are buffered until batch_size is reached, which keeps memory use
    bounded regardless of file size.
    """
    with open(filename, "r") as file:
        pending = []
        for record in csv.DictReader(file):
            pending.append(record)
            if len(pending) >= batch_size:
                process_batch(pending)
                pending = []
        # Flush whatever is left after the last full batch.
        if pending:
            process_batch(pending)
def process_batch(batch):
    """Validate and process a single batch of CSV rows."""
    print(f"Processing {len(batch)} rows")
    # Warn on every record whose name field is empty.
    for record in batch:
        if not record["name"]:
            print(f"Warning: Empty name in row")
# Usage
# process_large_csv("large_file.csv")

Handle large CSV files without loading everything into memory.
Error Handling
Robust CSV processing with error handling.
Handling Bad Data
import csv
def safe_csv_reader(filename):
    """Read a CSV file row by row, reporting bad rows instead of crashing."""
    with open(filename, "r") as file:
        for line_num, row in enumerate(csv.reader(file), 1):
            try:
                # A well-formed row is exactly: name, age, city.
                if len(row) != 3:
                    print(f"Line {line_num}: Expected 3 fields, got {len(row)}")
                    continue
                name, age, city = row
                age = int(age)  # raises ValueError for non-numeric ages
                print(f"Processed: {name}, {age}, {city}")
            except ValueError as e:
                # age still holds the original string here, so the message
                # shows the offending value.
                print(f"Line {line_num}: Invalid age '{age}': {e}")
            except Exception as e:
                print(f"Line {line_num}: Unexpected error: {e}")
# Test with problematic data
csv_content = """name,age,city
Alice,30,New York
Bob,not_a_number,Chicago
Charlie,35
Dave,40,Boston,Extra"""

from io import StringIO
csv_file = StringIO(csv_content)

# Persist the sample to disk so safe_csv_reader can open it by name.
with open("temp.csv", "w") as f:
    f.write(csv_content)
safe_csv_reader("temp.csv")Handle malformed CSV data gracefully.
Advanced Features
Sniffer for Format Detection
import csv
from io import StringIO

# CSV whose delimiter is not known ahead of time.
sample_data = "name;age;city\nAlice;30;New York\nBob;25;Chicago"
csv_file = StringIO(sample_data)

# Let Sniffer inspect a sample, then rewind before the real parse.
sniffer = csv.Sniffer()
sample = csv_file.read(1024)
csv_file.seek(0)
delimiter = sniffer.sniff(sample).delimiter
print(f"Detected delimiter: '{delimiter}'")

# Parse using the delimiter the sniffer found.
reader = csv.reader(csv_file, delimiter=delimiter)
for row in reader:
    print(row)

csv.Sniffer automatically detects CSV format parameters.
Unicode Handling
import csv

# Rows containing non-ASCII characters.
data = [
    ["name", "city"],
    ["José", "São Paulo"],
    ["François", "Montréal"],
    ["Михаил", "Москва"],
]

# Always name the encoding explicitly when writing text.
with open("unicode.csv", "w", newline="", encoding="utf-8") as out:
    csv.writer(out).writerows(data)
print("Unicode CSV created")

# Reopen with the same encoding so the bytes decode correctly.
with open("unicode.csv", "r", encoding="utf-8") as file:
    reader = csv.reader(file)
for row in reader:
    print(row)

Handle international characters with proper encoding.
Performance Tips
Benchmarking
import csv
import time

# Header plus 10,000 generated rows.
data = [["col1", "col2", "col3"]] + [[f"row{i}", i, i * 2] for i in range(10000)]

# Time the write path.
start = time.time()
with open("benchmark.csv", "w", newline="") as out:
    csv.writer(out).writerows(data)
write_time = time.time() - start

# Time the read path (materialize every row).
start = time.time()
with open("benchmark.csv", "r") as infile:
    rows = list(csv.reader(infile))
read_time = time.time() - start

print(f"Writing 10k rows: {write_time:.4f} seconds")
print(f"Reading 10k rows: {read_time:.4f} seconds")CSV operations are generally fast, but profile for bottlenecks.
Best Practices
- Use `with` statements for file operations
- Specify `newline=""` when writing
- Use `DictReader`/`DictWriter` for complex data
- Handle encoding explicitly for Unicode data
- Validate data when reading from external sources
- Use appropriate quoting for data with special characters
- Consider pandas for complex data analysis
External Resources:
Related Tutorials: