Python CSV Handling

CSV (Comma-Separated Values) is a common format for tabular data. Python’s csv module provides tools for reading and writing CSV files, handling various delimiters and edge cases.

CSV Basics

CSV files store tabular data as plain text.

CSV Structure

# A minimal CSV file looks like this (header line, then one record per line):
# name,age,city
# Alice,30,New York
# Bob,25,Chicago
# Charlie,35,Los Angeles

print("CSV files contain rows of comma-separated values")
print("First row usually contains column headers")

CSV is simple but has variations in delimiters and quoting; RFC 4180 gives a formal description of the most common conventions.

Reading CSV Files

Use csv.reader() to read CSV data.

Basic Reading

import csv

# Inline sample: a header line followed by three data records.
csv_data = """name,age,city
Alice,30,New York
Bob,25,Chicago
Charlie,35,Los Angeles"""

# csv.reader needs a file-like object; StringIO adapts the in-memory string.
from io import StringIO

for row in csv.reader(StringIO(csv_data)):
    print(row)

# Output:
# ['name', 'age', 'city']
# ['Alice', '30', 'New York']
# ['Bob', '25', 'Chicago']
# ['Charlie', '35', 'Los Angeles']

csv.reader() returns an iterator of lists, one per row.

Reading from Files

import csv

# Read rows from data.csv, treating the first row as a header.
with open("data.csv", "r") as fh:
    rows = csv.reader(fh)

    # Consume the header row before iterating over the data records.
    header = next(rows)
    print(f"Headers: {header}")

    # Unpack each remaining record into its three expected columns.
    # NOTE(review): assumes every data row has exactly 3 fields.
    for name, age, city in rows:
        print(f"{name} is {age} years old and lives in {city}")

Use with statement for automatic file closing. next() skips header row.

Handling Different Delimiters

import csv
from io import StringIO

# The csv module is not comma-only: any single-character delimiter works.
# First a tab-separated sample, then a semicolon-separated one
# (semicolons are common in European locales).
samples = [
    ("name\tage\tcity\nAlice\t30\tNew York", '\t'),
    ("name;age;city\nAlice;30;New York", ';'),
]

for text, sep in samples:
    # Pass the separator via the delimiter parameter.
    for row in csv.reader(StringIO(text), delimiter=sep):
        print(row)

Use delimiter parameter for non-comma separators.

Writing CSV Files

Use csv.writer() to create CSV files.

Basic Writing

import csv

# Rows to persist: header first, then the data records.
data = [
    ["name", "age", "city"],
    ["Alice", "30", "New York"],
    ["Bob", "25", "Chicago"],
    ["Charlie", "35", "Los Angeles"]
]

# newline="" lets the csv module control line endings itself,
# avoiding doubled blank lines on Windows.
with open("output.csv", "w", newline="") as out:
    csv.writer(out).writerows(data)

print("CSV file created successfully")

# Round-trip check: dump the file contents back to stdout.
with open("output.csv", "r") as src:
    print(src.read())

newline="" prevents extra blank lines on Windows. writerows() writes multiple rows.

Writing Row by Row

import csv

# Each employee record maps a field name to its value.
employees = [
    {"name": "Alice", "department": "Engineering", "salary": 75000},
    {"name": "Bob", "department": "Sales", "salary": 65000},
    {"name": "Charlie", "department": "Marketing", "salary": 70000}
]

# DictWriter maps dict keys onto CSV columns via the fieldnames list.
fieldnames = ["name", "department", "salary"]

with open("employees.csv", "w", newline="") as out:
    writer = csv.DictWriter(out, fieldnames=fieldnames)

    # Emit the column-header row first.
    writer.writeheader()

    # One CSV row per employee dict.
    for record in employees:
        writer.writerow(record)

print("Employee CSV created")

csv.DictWriter writes dictionaries as CSV rows.

DictReader and DictWriter

Handle CSV data as dictionaries for easier access.

DictReader

import csv
from io import StringIO

# Sample CSV whose first line is the header.
csv_content = """name,age,city,department
Alice,30,New York,Engineering
Bob,25,Chicago,Sales
Charlie,35,Los Angeles,Marketing"""

csv_file = StringIO(csv_content)

# DictReader yields one dict per data row, keyed by the header columns.
for row in csv.DictReader(csv_file):
    print(f"{row['name']} works in {row['department']} and is {row['age']} years old")

# Rewind the buffer and build a fresh reader to inspect the parsed header.
csv_file.seek(0)
reader = csv.DictReader(csv_file)
print(f"Fieldnames: {reader.fieldnames}")

DictReader returns each row as a dictionary with column names as keys.

DictWriter

import csv

# Product records keyed by column name.
products = [
    {"name": "Laptop", "price": 999.99, "category": "Electronics"},
    {"name": "Book", "price": 19.99, "category": "Education"},
    {"name": "Chair", "price": 149.99, "category": "Furniture"}
]

fieldnames = ["name", "price", "category"]

# Header plus all rows in one pass with DictWriter.
with open("products.csv", "w", newline="") as out:
    writer = csv.DictWriter(out, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(products)

print("Products CSV created")

# Read the file back; note DictReader returns every field as a string.
with open("products.csv", "r") as src:
    for product in csv.DictReader(src):
        print(f"{product['name']}: ${product['price']} ({product['category']})")

DictWriter takes dictionaries and writes them as CSV rows.

Handling Edge Cases

CSV files can have tricky data that needs special handling.

Quoted Fields

import csv
from io import StringIO

# Quoted fields may contain the delimiter itself; a doubled quote ("")
# inside a quoted field is the CSV escape for a literal quote character.
csv_data = '''name,description,price
"Widget A","A useful widget, with comma",19.99
"Widget B","Another widget",29.99
"Widget C","Widget with ""quotes""",39.99'''

for row in csv.reader(StringIO(csv_data)):
    print(row)

# The reader unquotes fields transparently:
# ['name', 'description', 'price']
# ['Widget A', 'A useful widget, with comma', '19.99']
# ['Widget B', 'Another widget', '29.99']
# ['Widget C', 'Widget with "quotes"', '39.99']

CSV handles quoted fields with embedded commas and quotes.

Custom Quoting

import csv

# Fields containing delimiters or quote characters need quoting on output.
data = [
    ["name", "description"],
    ["Widget", "A widget, with comma"],
    ["Gadget", 'Gadget with "quotes"'],
    ["Tool", "Normal description"]
]

# QUOTE_ALL: wrap every field in quotes, whether it needs them or not.
with open("quoted.csv", "w", newline="") as out:
    csv.writer(out, quoting=csv.QUOTE_ALL).writerows(data)

print("CSV with all fields quoted created")

# QUOTE_MINIMAL is the default: quote only fields that require it.
with open("minimal.csv", "w", newline="") as out:
    csv.writer(out, quoting=csv.QUOTE_MINIMAL).writerows(data)

print("CSV with minimal quoting created")

Control quoting behavior with the quoting parameter.

Data Processing

Common operations when working with CSV data.

Filtering and Transforming

import csv

# Read employees.csv, keep only high earners, and write them out with a
# derived "bonus" column.
with open("employees.csv", "r") as infile, open("high_salary.csv", "w", newline="") as outfile:
    reader = csv.DictReader(infile)
    # BUG FIX: rows gain a "bonus" key below, and DictWriter raises
    # ValueError for any dict key missing from fieldnames — so the new
    # column must be declared up front.
    fieldnames = list(reader.fieldnames) + ["bonus"]

    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # CSV values are strings; convert before the numeric comparison.
        salary = float(row["salary"])
        if salary > 70000:
            # 10% bonus, stored as a string like every other CSV field.
            row["bonus"] = str(salary * 0.1)
            writer.writerow(row)

print("Filtered and transformed data saved")

Process CSV data on-the-fly during reading/writing.

Data Analysis

import csv
from collections import defaultdict

# Per-department tallies: head count and summed salary.
department_count = defaultdict(int)
department_salary = defaultdict(float)

with open("employees.csv", "r") as file:
    for row in csv.DictReader(file):
        dept = row["department"]
        department_count[dept] += 1
        department_salary[dept] += float(row["salary"])

# Report head count and mean salary for each department
# (dicts iterate in insertion order, i.e. first-seen department first).
for dept, count in department_count.items():
    avg_salary = department_salary[dept] / count
    print(f"{dept}: {count} employees, avg salary ${avg_salary:.2f}")

Perform data analysis on CSV content.

Large File Handling

import csv

def process_large_csv(filename, batch_size=1000):
    """Stream a CSV file through process_batch(), batch_size rows at a time.

    Only one batch of DictReader rows is held in memory at any moment,
    so arbitrarily large files can be processed.
    """
    with open(filename, "r") as src:
        pending = []
        for record in csv.DictReader(src):
            pending.append(record)
            if len(pending) >= batch_size:
                # A full batch is ready: hand it off and start a fresh one.
                process_batch(pending)
                pending = []
        if pending:
            # Flush the final, possibly short, batch.
            process_batch(pending)

def process_batch(batch):
    """Process one batch of CSV rows (dicts), validating the name field.

    Assumes each row has a "name" key, as produced by the DictReader
    upstream in process_large_csv.
    """
    print(f"Processing {len(batch)} rows")
    for row in batch:
        # Flag rows whose name is missing or empty. BUG FIX: the original
        # warning was an f-string with no placeholder; include the
        # offending row so the warning is actionable.
        if not row["name"]:
            print(f"Warning: Empty name in row {row}")

# Usage
# process_large_csv("large_file.csv")

Handle large CSV files without loading everything into memory.

Error Handling

Robust CSV processing with error handling.

Handling Bad Data

import csv

def safe_csv_reader(filename):
    """Read a CSV file row by row, reporting malformed rows instead of crashing.

    Expects exactly three fields per row (name, age, city) with a numeric
    age. NOTE(review): the header row is not skipped, so a file with a
    header will report its first line as an invalid age.
    """
    with open(filename, "r") as src:
        # Count from 1 so messages use human-style line numbers.
        for line_num, row in enumerate(csv.reader(src), 1):
            try:
                # Reject rows with the wrong field count before unpacking.
                if len(row) != 3:
                    print(f"Line {line_num}: Expected 3 fields, got {len(row)}")
                    continue

                name, age, city = row
                age = int(age)  # raises ValueError for non-numeric ages

                print(f"Processed: {name}, {age}, {city}")
            except ValueError as e:
                print(f"Line {line_num}: Invalid age '{age}': {e}")
            except Exception as e:
                # Catch-all keeps processing subsequent lines.
                print(f"Line {line_num}: Unexpected error: {e}")
# Exercise safe_csv_reader with deliberately problematic data:
# a non-numeric age, a short row, and a row with an extra field.
csv_content = """name,age,city
Alice,30,New York
Bob,not_a_number,Chicago
Charlie,35
Dave,40,Boston,Extra"""

# safe_csv_reader operates on files, so persist the sample to disk first.
# (The original also built an unused StringIO copy here; removed as dead code.)
with open("temp.csv", "w") as f:
    f.write(csv_content)

safe_csv_reader("temp.csv")

Handle malformed CSV data gracefully.

Advanced Features

Sniffer for Format Detection

import csv
from io import StringIO

# Delimiter unknown ahead of time — let csv.Sniffer work it out.
sample_data = "name;age;city\nAlice;30;New York\nBob;25;Chicago"
csv_file = StringIO(sample_data)

# Sniff a bounded sample, then rewind so parsing starts from the top.
sample = csv_file.read(1024)
csv_file.seek(0)

dialect = csv.Sniffer().sniff(sample)
delimiter = dialect.delimiter
print(f"Detected delimiter: '{delimiter}'")

# Parse using whatever delimiter was detected.
for row in csv.reader(csv_file, delimiter=delimiter):
    print(row)

csv.Sniffer automatically detects CSV format parameters.

Unicode Handling

import csv

# Names and cities outside ASCII — the file encoding matters here.
data = [
    ["name", "city"],
    ["José", "São Paulo"],
    ["François", "Montréal"],
    ["Михаил", "Москва"]
]

# Always pass an explicit encoding; UTF-8 covers all of the above.
with open("unicode.csv", "w", newline="", encoding="utf-8") as out:
    csv.writer(out).writerows(data)

print("Unicode CSV created")

# Read back with the same encoding to round-trip the characters intact.
with open("unicode.csv", "r", encoding="utf-8") as src:
    for row in csv.reader(src):
        print(row)

Handle international characters with proper encoding.

Performance Tips

Benchmarking

import csv
import time

# 10k data rows plus a header row.
data = [["col1", "col2", "col3"]] + [[f"row{i}", i, i*2] for i in range(10000)]

# Time a bulk write...
start = time.time()
with open("benchmark.csv", "w", newline="") as out:
    csv.writer(out).writerows(data)
write_time = time.time() - start

# ...then a bulk read of the same file into memory.
start = time.time()
with open("benchmark.csv", "r") as src:
    rows = list(csv.reader(src))
read_time = time.time() - start

print(f"Writing 10k rows: {write_time:.4f} seconds")
print(f"Reading 10k rows: {read_time:.4f} seconds")

CSV operations are generally fast, but profile for bottlenecks.

Best Practices

  1. Use with statements for file operations
  2. Specify newline="" when writing
  3. Use DictReader/DictWriter for complex data
  4. Handle encoding explicitly for Unicode data
  5. Validate data when reading from external sources
  6. Use appropriate quoting for data with special characters
  7. Consider pandas for complex data analysis

External Resources:

Related Tutorials:

Last updated on