Python Generators
Generators are functions that return iterators. They let you iterate over a sequence without holding the whole thing in memory, which makes them well suited to large datasets.
What are Generators?
Generators produce values on the fly instead of building the entire sequence up front.
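As a minimal illustration, compare a list comprehension, which materializes every value up front, with the equivalent generator expression, which computes each value only when asked:

# List: all one million squares exist in memory at once
squares_list = [x**2 for x in range(1_000_000)]

# Generator: each square is computed only when the loop asks for it
squares_gen = (x**2 for x in range(1_000_000))

total = 0
for s in squares_gen:
    total += s
print(total)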
Generator Functions
def count_up_to(n):
    """Generator that yields numbers from 1 to n."""
    count = 1
    while count <= n:
        yield count
        count += 1

# Create a generator object
counter = count_up_to(5)

# Iterate through the values
for num in counter:
    print(num)  # Prints 1, 2, 3, 4, 5

yield pauses the function's execution and returns a value; the function's state is preserved between calls.
Generator Expressions
# Generator expression (similar to a list comprehension)
squares = (x**2 for x in range(10))

# Use it like any iterator
for square in squares:
    print(square)  # Prints 0, 1, 4, 9, 16, 25, 36, 49, 64, 81

# Convert to a list if needed
squares_list = list(squares)  # Empty here: squares was already exhausted by the loop

Generator expressions use parentheses instead of square brackets and are more memory-efficient than list comprehensions.
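One related idiom, shown here as a small sketch: when a generator expression is the only argument to a function call, the extra parentheses can be dropped.

# The surrounding call's parentheses are enough
total = sum(x**2 for x in range(10))
longest = max(len(word) for word in ['hi', 'hello', 'hey'])
print(total, longest)  # 285 5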
How Generators Work
Generators maintain state and resume execution.
Generator Lifecycle
def simple_generator():
    print("Generator started")
    yield 1
    print("After first yield")
    yield 2
    print("After second yield")
    yield 3
    print("Generator finished")

gen = simple_generator()

print("Calling next() first time:")
print(next(gen))  # Generator started \n 1
print("Calling next() second time:")
print(next(gen))  # After first yield \n 2
print("Calling next() third time:")
print(next(gen))  # After second yield \n 3
print("Calling next() fourth time:")
# print(next(gen))  # Prints "Generator finished", then raises StopIteration

next() resumes execution until the next yield; StopIteration is raised once the generator is exhausted.
Manual Iteration
def fibonacci(limit):
    a, b = 0, 1
    count = 0
    while count < limit:
        yield a
        a, b = b, a + b
        count += 1

fib_gen = fibonacci(10)

# Manual iteration
try:
    while True:
        print(next(fib_gen), end=" ")
except StopIteration:
    print("\nGenerator exhausted")

Use next() directly for manual control over iteration.
Memory Efficiency
Generators use minimal memory compared to lists.
Memory Comparison
import sys

# List approach - stores all values
def get_squares_list(n):
    return [x**2 for x in range(n)]

# Generator approach - computes on demand
def get_squares_gen(n):
    for x in range(n):
        yield x**2

# Compare memory usage
n = 1000000
squares_list = get_squares_list(n)
squares_gen = get_squares_gen(n)

print(f"List size: {sys.getsizeof(squares_list)} bytes")
print(f"Generator size: {sys.getsizeof(squares_gen)} bytes")

# The list contains all the values
print(f"List length: {len(squares_list)}")

# The generator is just a small object
print(f"Generator type: {type(squares_gen)}")

Generators are far more memory-efficient than lists for large datasets.
Generator Methods
Generators have special methods for control.
send() Method
Send values back into the generator.
def accumulator():
    total = 0
    while True:
        value = (yield total)
        if value is not None:
            total += value

acc = accumulator()

# Prime the generator
next(acc)  # Advance to the first yield

print(acc.send(10))  # Send 10, get the total (10)
print(acc.send(5))   # Send 5, get the total (15)
print(acc.send(3))   # Send 3, get the total (18)

send() allows two-way communication with a generator.
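As another illustration of two-way communication, here is a small running-average coroutine, a common textbook pattern rather than any library API:

def running_average():
    total = 0.0
    count = 0
    average = None
    while True:
        value = (yield average)  # Receive a number, emit the current average
        total += value
        count += 1
        average = total / count

avg = running_average()
next(avg)            # Prime the coroutine
print(avg.send(10))  # 10.0
print(avg.send(20))  # 15.0
print(avg.send(3))   # 11.0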
throw() Method
Inject exceptions into generators.
def safe_divider():
    try:
        while True:
            x, y = (yield)
            yield x / y
    except ZeroDivisionError:
        yield "Cannot divide by zero"

divider = safe_divider()
next(divider)                 # Prime: advance to the first (yield)
print(divider.send((10, 2)))  # Send values, get the quotient: 5.0
next(divider)                 # Advance back to the receiving (yield)

# Inject an exception at the point where the generator is paused
result = divider.throw(ZeroDivisionError("Custom error"))
print(result)  # "Cannot divide by zero"

throw() raises the exception inside the generator at the paused yield; here the generator's own except block catches it and yields a message.
close() Method
Clean up generator resources.
def file_reader(filename):
    try:
        with open(filename, 'r') as file:
            for line in file:
                yield line.strip()
    except GeneratorExit:
        print("Generator closing, cleaning up...")
        # Cleanup code here

reader = file_reader("large_file.txt")

# Use only some of the values
for i, line in enumerate(reader):
    print(line)
    if i >= 2:           # Just read the first few lines
        reader.close()   # Close the generator
        break

close() raises GeneratorExit inside the generator so it can clean up.
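A try/finally achieves the same cleanup and runs whether the generator finishes normally, is closed, or raises; a minimal sketch assuming the same file setup:

def counted_reader(filename):
    count = 0
    try:
        with open(filename, 'r') as file:
            for line in file:
                count += 1
                yield line.strip()
    finally:
        # Runs on normal exhaustion, on close(), and on errors
        print(f"Cleaned up after {count} line(s)")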
Advanced Generator Patterns
Chaining Generators
def numbers():
    for i in range(10):
        yield i

def squares(gen):
    for num in gen:
        yield num ** 2

def even_squares(gen):
    for num in gen:
        if num % 2 == 0:
            yield num

# Chain the generators into a pipeline
result = even_squares(squares(numbers()))
print(list(result))  # [0, 4, 16, 36, 64]

Create processing pipelines by chaining generators.
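The same pipeline can also be written with generator expressions alone, which reads well when the stages are simple:

# An equivalent pipeline built from generator expressions
nums = (i for i in range(10))
sqs = (n ** 2 for n in nums)
evens = (n for n in sqs if n % 2 == 0)
print(list(evens))  # [0, 4, 16, 36, 64]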
Generator with Context Manager
from contextlib import contextmanager

@contextmanager
def database_connection(db_name):
    print(f"Connecting to {db_name}")
    # Simulate a connection
    connection = f"Connection to {db_name}"
    try:
        yield connection
    finally:
        print(f"Closing connection to {db_name}")

def get_data(connection):
    # Simulate data retrieval
    yield f"Data from {connection} - Record 1"
    yield f"Data from {connection} - Record 2"
    yield f"Data from {connection} - Record 3"

with database_connection("mydb") as conn:
    for record in get_data(conn):
        print(record)

Combine generators with context managers for resource management.
itertools Module
Powerful tools for working with iterators and generators.
Infinite Generators
import itertools

# Infinite count
counter = itertools.count(start=10, step=2)
print(list(itertools.islice(counter, 5)))  # [10, 12, 14, 16, 18]

# Infinite cycle
cycler = itertools.cycle(['A', 'B', 'C'])
print(list(itertools.islice(cycler, 7)))  # ['A', 'B', 'C', 'A', 'B', 'C', 'A']

# Repeat a value (finite here, because a count is given)
repeater = itertools.repeat('Hello', 3)
print(list(repeater))  # ['Hello', 'Hello', 'Hello']

Create infinite sequences with itertools, and use islice() to take a bounded slice of them.
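Besides islice(), itertools.takewhile() cuts off an infinite generator as soon as a condition fails:

import itertools

# Take values from an infinite counter until the condition fails
counter = itertools.count(start=10, step=2)
print(list(itertools.takewhile(lambda x: x < 20, counter)))  # [10, 12, 14, 16, 18]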
Combinatorics
import itertools

# Permutations
items = ['A', 'B', 'C']
perms = list(itertools.permutations(items, 2))
print(perms)  # [('A', 'B'), ('A', 'C'), ('B', 'A'), ('B', 'C'), ('C', 'A'), ('C', 'B')]

# Combinations
combs = list(itertools.combinations(items, 2))
print(combs)  # [('A', 'B'), ('A', 'C'), ('B', 'C')]

# Cartesian product
product = list(itertools.product(['X', 'Y'], [1, 2]))
print(product)  # [('X', 1), ('X', 2), ('Y', 1), ('Y', 2)]

Generate permutations, combinations, and Cartesian products efficiently.
Grouping and Filtering
import itertools

# Group by key (groupby expects the data to be sorted by that key)
data = [
    ('A', 1), ('A', 2), ('B', 3), ('B', 4), ('C', 5)
]
for key, group in itertools.groupby(data, key=lambda x: x[0]):
    print(f"{key}: {list(group)}")

# Keep truthy values: filterfalse drops items where the predicate is true
data = [0, 1, False, 2, '', 3, None, 4]
filtered = list(itertools.filterfalse(lambda x: not x, data))
print(filtered)  # [1, 2, 3, 4]

Group and filter data with itertools functions.
Performance Considerations
When to Use Generators
import time

def list_approach(n):
    # Create the entire list in memory
    return [i**2 for i in range(n)]

def generator_approach(n):
    # Generate values on demand
    for i in range(n):
        yield i**2

# Time comparison
n = 1000000

start = time.time()
result_list = list_approach(n)
list_time = time.time() - start

start = time.time()
result_gen = list(generator_approach(n))
gen_time = time.time() - start

print(f"List approach: {list_time:.4f} seconds")
print(f"Generator approach: {gen_time:.4f} seconds")
print(f"Memory: the list holds {len(result_list)} items at once")

When every value is consumed, the two approaches take similar time; the generator's advantage is memory, and it also wins on time when you can stop early instead of materializing everything.
Generator Gotchas
# Gotcha 1: generators are exhausted after one pass
gen = (x for x in range(3))
print(list(gen))  # [0, 1, 2]
print(list(gen))  # [] - exhausted!

# Gotcha 2: you can't get the length
gen = (x for x in range(100))
# len(gen)  # TypeError!

# Gotcha 3: you can't index
# gen[0]  # TypeError!

# Solutions
gen = (x for x in range(3))
gen_list = list(gen)  # Convert to a list if needed
print(len(gen_list))
print(gen_list[0])

Generators can only be consumed once; convert to a list if you need multiple passes.
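If the data is too large for a list but you still need more than one pass, itertools.tee() splits one iterator into independent copies (note that it buffers any values one copy has seen and another has not):

import itertools

gen = (x for x in range(3))
first, second = itertools.tee(gen, 2)
print(list(first))   # [0, 1, 2]
print(list(second))  # [0, 1, 2]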
Real-World Examples
Reading Large Files
def read_large_file(filename, chunk_size=1024):
    """Read a large file in chunks without loading it entirely into memory."""
    with open(filename, 'rb') as file:
        while chunk := file.read(chunk_size):
            yield chunk

# Process a large file chunk by chunk
for chunk in read_large_file("huge_file.dat"):
    process_chunk(chunk)  # process_chunk is a placeholder for your own handler

Process large files without memory issues.
Data Pipeline
def read_csv(filename):
    """Read CSV rows as lists of fields."""
    with open(filename, 'r') as file:
        next(file)  # Skip the header
        for line in file:
            yield line.strip().split(',')

def filter_valid_data(rows):
    """Filter out invalid rows."""
    for row in rows:
        if len(row) == 3 and all(field for field in row):
            yield row

def convert_to_dict(rows):
    """Convert rows to dictionaries."""
    for row in rows:
        yield {
            'name': row[0],
            'age': int(row[1]),
            'city': row[2]
        }

# Create the processing pipeline
data_pipeline = convert_to_dict(filter_valid_data(read_csv("data.csv")))
for record in data_pipeline:
    print(record)

Create efficient data-processing pipelines.
Best Practices
- Use generators for large datasets or infinite sequences
- Prefer generator expressions over list comprehensions when possible
- Use itertools for complex iterator operations
- Handle StopIteration appropriately
- Consider memory usage when choosing between lists and generators
- Use yield from for delegating to sub-generators (see the sketch below)
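As a short sketch of that last point, yield from delegates iteration to a sub-generator and forwards send() and throw() to it as well:

def inner():
    yield 1
    yield 2

def outer():
    yield 0
    yield from inner()  # inner's values flow straight through to the caller
    yield 3

print(list(outer()))  # [0, 1, 2, 3]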