Python Regular Expressions
Regular expressions (regex) are powerful tools for pattern matching and text manipulation. Python’s re module provides comprehensive regex support for searching, validating, and transforming text.
Introduction to Regex
Regular expressions define patterns to match text.
Basic Matching
import re
# Simple pattern matching
text = "Hello, World!"
pattern = r"Hello"
if re.search(pattern, text):
print("Pattern found!")
else:
print("Pattern not found")Use re.search() to find patterns anywhere in text. Import re module first. See re module documentation.
Raw Strings
# Use raw strings for regex patterns
pattern1 = r"\d+" # Matches digits
pattern2 = "\\d+" # Same as above, but less readable
text = "I have 42 apples"
matches = re.findall(pattern1, text)
print(matches) # ['42']Raw strings (r"") prevent backslash escaping issues. Essential for regex.
Basic Patterns
Common regex patterns for matching characters.
Character Classes
text = "Hello123 World456!"
# Digits
digits = re.findall(r"\d", text) # ['1', '2', '3', '4', '5', '6']
digit_sequences = re.findall(r"\d+", text) # ['123', '456']
# Word characters (letters, digits, underscore)
words = re.findall(r"\w+", text) # ['Hello123', 'World456']
# Whitespace
spaces = re.findall(r"\s", text) # [' ']
# Non-digits, non-words, non-spaces
non_digits = re.findall(r"\D", text) # ['H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd', '!']
print(f"Digits: {digits}")
print(f"Words: {words}")\d, \w, \s match digits, word chars, spaces. Uppercase versions match opposites.
Custom Character Classes
text = "Python 3.9 is awesome!"
# Match vowels
vowels = re.findall(r"[aeiou]", text) # ['o', 'i', 'a', 'e', 'o', 'e']
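# Match consonants (word characters that are not vowels, digits, or underscore)
# Note: [^aeiou\s\d] would also match punctuation such as '.' and '!'
consonants = re.findall(r"[^aeiou\W\d_]", text) # ['P', 'y', 't', 'h', 'n', 's', 'w', 's', 'm']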
# Match specific characters
specific = re.findall(r"[aeou]", text) # ['o', 'a', 'e', 'o', 'e']
# Match ranges
letters = re.findall(r"[a-zA-Z]", text) # All letters
print(f"Vowels: {vowels}")
print(f"Consonants: {consonants}")[abc] matches any of a, b, c. [^abc] matches anything except a, b, c. [a-z] matches ranges.
Quantifiers
Control how many times patterns repeat.
Greedy Quantifiers
text = "Hellooooo Worldddd"
# Zero or more
zero_or_more = re.findall(r"o*", text) # ['', '', '', '', 'ooooo', '', '', 'o', '', '', '', '', '', '', ''] (an empty match at every position without an 'o')
# One or more
one_or_more = re.findall(r"o+", text) # ['ooooo', 'o']
# Zero or one
zero_or_one = re.findall(r"o?", text) # ['', '', '', '', 'o', 'o', 'o', 'o', 'o', '', '', 'o', '', '', '', '', '', '', '']
# Exactly n times
exactly_three = re.findall(r"o{3}", text) # ['ooo']
# At least n times
at_least_three = re.findall(r"o{3,}", text) # ['ooooo']
# Between n and m times
between_two_four = re.findall(r"o{2,4}", text) # ['oooo']
print(f"One or more: {one_or_more}")
print(f"Exactly three: {exactly_three}")* (0+), + (1+), ? (0-1), {n}, {n,}, {n,m} control repetition.
Non-Greedy Quantifiers
html = "<div>Hello</div><div>World</div>"
# Greedy (matches too much)
greedy = re.findall(r"<div>.*</div>", html) # ['<div>Hello</div><div>World</div>']
# Non-greedy (matches minimally)
non_greedy = re.findall(r"<div>.*?</div>", html) # ['<div>Hello</div>', '<div>World</div>']
print(f"Greedy: {greedy}")
print(f"Non-greedy: {non_greedy}")Add ? after quantifier for non-greedy matching. Matches as little as possible.
Anchors and Boundaries
Match positions rather than characters.
Position Anchors
text = "Python is fun. Python is powerful."
# Start of string
start = re.findall(r"^Python", text) # ['Python']
# End of string
end = re.findall(r"powerful\.$", text) # ['powerful.']
# Word boundaries
word_bound = re.findall(r"\bPython\b", text) # ['Python', 'Python']
# Not word boundary
not_bound = re.findall(r"Python\B", text) # []
print(f"Start: {start}")
print(f"Word boundaries: {word_bound}")^ matches start, $ matches end, \b matches word boundaries.
Groups and Capturing
Group patterns and capture matched text.
Capturing Groups
text = "John Doe, Jane Smith, Bob Johnson"
# Capture first and last names
names = re.findall(r"(\w+)\s+(\w+)", text) # [('John', 'Doe'), ('Jane', 'Smith'), ('Bob', 'Johnson')]
# Extract email components
email = "user@example.com"
email_match = re.match(r"([^@]+)@([^@]+)", email)
if email_match:
username, domain = email_match.groups()
print(f"Username: {username}, Domain: {domain}")
# Named groups
date_text = "2023-12-25"
date_match = re.search(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})", date_text)
if date_match:
print(f"Year: {date_match.group('year')}")
print(f"Month: {date_match.group('month')}")Parentheses () create capturing groups. Access with .groups() or .group(n).
Non-Capturing Groups
text = "apple orange banana"
# Capturing groups (unnecessary)
capturing = re.findall(r"(apple|orange|banana)", text)
# Non-capturing groups (better for alternation)
non_capturing = re.findall(r"(?:apple|orange|banana)", text)
print(f"Fruits: {non_capturing}")(?:...) creates non-capturing groups. Use for alternation without capturing.
Common Regex Functions
Essential functions in the re module.
re.search()
text = "The price is $19.99 and tax is $2.50"
# Find first match
price_match = re.search(r"\$\d+\.\d{2}", text)
if price_match:
print(f"First price: {price_match.group()}") # $19.99
print(f"Position: {price_match.span()}") # (13, 19)Returns match object for first occurrence, or None.
re.findall()
text = "Numbers: 123, 456, 789"
# Find all matches
numbers = re.findall(r"\d+", text) # ['123', '456', '789']
# With groups
pairs = re.findall(r"(\d+),\s*(\d+)", "1,2 3,4 5,6") # [('1', '2'), ('3', '4'), ('5', '6')]
print(f"Numbers: {numbers}")Returns list of all matches (strings) or tuples for groups.
re.finditer()
text = "Python 2.7, Python 3.6, Python 3.9"
# Iterator of match objects
matches = re.finditer(r"Python (\d+\.\d+)", text)
for match in matches:
version = match.group(1)
position = match.start()
print(f"Version {version} at position {position}")Returns iterator of match objects. Useful for large texts.
re.match()
text = "Python is great"
# Match from start only
match = re.match(r"Python", text) # Match object
no_match = re.match(r"great", text) # None
if match:
print(f"Matched: {match.group()}") # PythonOnly matches from string beginning. Use re.search() for anywhere.
re.sub()
text = "I have 2 apples and 3 oranges"
# Replace patterns
censored = re.sub(r"\d+", "X", text) # "I have X apples and X oranges"
# Replace with function result
def double_number(match):
    """Return the number matched by *match*, doubled, as a string.

    Intended as a replacement callback for re.sub(): re.sub() calls it
    with each match object and substitutes the returned string.
    """
    num = int(match.group())
    return str(num * 2)
doubled = re.sub(r"\d+", double_number, text) # "I have 4 apples and 6 oranges"
print(f"Censored: {censored}")
print(f"Doubled: {doubled}")Replace matches with strings or function results.
re.split()
text = "apple, banana; orange: grape"
# Split on multiple delimiters
fruits = re.split(r"[,;:]\s*", text) # ['apple', 'banana', 'orange', 'grape']
# Split with capture groups
with_delims = re.split(r"([,;:])\s*", text) # ['apple', ',', 'banana', ';', 'orange', ':', 'grape']
print(f"Fruits: {fruits}")Split strings on regex patterns. Capturing groups include delimiters.
Compilation and Flags
Improve performance and modify behavior.
Compiled Regex
import time
pattern = re.compile(r"\b\w+\b")
# Without compilation
start = time.time()
for _ in range(10000):
re.findall(r"\b\w+\b", "This is a test string")
uncompiled_time = time.time() - start
# With compilation
start = time.time()
for _ in range(10000):
pattern.findall("This is a test string")
compiled_time = time.time() - start
print(f"Uncompiled: {uncompiled_time:.4f}s")
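print(f"Compiled: {compiled_time:.4f}s")
Compile patterns that are reused, especially inside loops. The re module already caches recently used patterns, so the gain comes mainly from skipping the per-call cache lookup and is usually modest rather than dramatic.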
Regex Flags
text = """Python is great
Java is also good
python is case sensitive"""
# Case insensitive
case_insensitive = re.findall(r"python", text, re.IGNORECASE) # ['Python', 'python']
# Multiline mode
multiline = re.findall(r"^Python", text, re.MULTILINE) # ['Python']
# Dot matches newline
dot_all = re.findall(r"Python.*good", text, re.DOTALL) # ['Python is great\nJava is also good']
# Verbose mode (readable patterns)
pattern = re.compile(r"""
\b\d{1,3} # 1-3 digits
\. # dot
\d{1,3} # 1-3 digits
\. # dot
\d{1,3} # 1-3 digits
\. # dot
\d{1,3} # 1-3 digits
\b
""", re.VERBOSE)
ip = pattern.search("My IP is 192.168.1.1")
if ip:
print(f"IP found: {ip.group()}")
print(f"Case insensitive: {case_insensitive}")Flags modify regex behavior. Combine with | (e.g., re.IGNORECASE | re.MULTILINE).
Practical Examples
Email Validation
def is_valid_email(email):
    """Return True if *email* looks like a basic ``local@domain.tld`` address.

    This is a deliberately simple check, not a full RFC 5322 validator:
    the local part and the domain may contain word characters, dots and
    hyphens, and the domain must end in a dot followed by word characters.
    """
    pattern = r"[\w.-]+@[\w.-]+\.\w+"
    # fullmatch() requires the whole string to match. With re.match() and a
    # trailing "$", an address followed by a newline would still be accepted,
    # because "$" also matches just before a final newline.
    return re.fullmatch(pattern, email) is not None
emails = ["user@example.com", "first.last@sub.example.org", "invalid@", "@domain.com"]
for email in emails:
print(f"{email}: {is_valid_email(email)}")Simple email validation regex. Not perfect but covers basic cases.
Phone Number Extraction
text = """
Call me at (555) 123-4567 or 555-123-4567.
Also try 555.123.4567 or +1-555-123-4567.
"""
# Match various phone formats
phone_pattern = re.compile(r"""
(\+?\d{1,2}[\s.-]?)? # Optional country code
\(?(\d{3})\)?[\s.-]? # Area code (with optional parens)
(\d{3})[\s.-]? # First 3 digits
(\d{4}) # Last 4 digits
""", re.VERBOSE)
phones = phone_pattern.findall(text)
print("Phone numbers found:")
for phone in phones:
print(f"{' '.join(phone).strip()}")Extract phone numbers in various formats.
HTML Tag Removal
html = "<p>This is <b>bold</b> text with <a href='#'>links</a>.</p>"
# Remove HTML tags
clean_text = re.sub(r"<[^>]+>", "", html)
print(f"Clean text: {clean_text}") # "This is bold text with links."Strip HTML tags from text. Simple approach (not perfect for complex HTML).
Log File Parsing
log_line = '2023-12-25 10:30:45 ERROR app.py:123 Database connection failed'
# Parse log entry
pattern = r"(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) (\w+) (.+):(\d+) (.+)"
match = re.match(pattern, log_line)
if match:
date, time, level, file, line, message = match.groups()
print(f"Date: {date}")
print(f"Level: {level}")
print(f"Message: {message}")Extract structured data from log files.
Best Practices
- Use raw strings for patterns
- Compile regex for repeated use
- Test patterns thoroughly
- Use non-greedy quantifiers when appropriate
- Consider readability with verbose mode
- Handle edge cases and invalid input
- Use specific patterns rather than overly broad ones
External Resources:
Related Tutorials: