Python Regular Expressions

Python Regular Expressions

Regular expressions (regex) are powerful tools for pattern matching and text manipulation. Python’s re module provides comprehensive regex support for searching, validating, and transforming text.

Introduction to Regex

Regular expressions define patterns to match text.

Basic Matching

import re

# Simple pattern matching: report whether the literal "Hello" occurs in text.
text = "Hello, World!"
pattern = r"Hello"

# re.search() returns a match object (truthy) on success and None otherwise.
if re.search(pattern, text):
    print("Pattern found!")
else:
    print("Pattern not found")

Use re.search() to find a pattern anywhere in the text; it returns a match object, or None when nothing matches. Import the re module first. See the re module documentation.

Raw Strings

# Use raw strings for regex patterns
pattern1 = r"\d+"  # Matches one or more consecutive digits
pattern2 = "\\d+"  # Same pattern without a raw string: the backslash must be doubled

text = "I have 42 apples"
matches = re.findall(pattern1, text)
print(matches)  # ['42']

Raw strings (r"") prevent backslash escaping issues. Essential for regex.

Basic Patterns

Common regex patterns for matching characters.

Character Classes

text = "Hello123 World456!"

# Digits
digits = re.findall(r"\d", text)  # ['1', '2', '3', '4', '5', '6']
digit_sequences = re.findall(r"\d+", text)  # ['123', '456']

# Word characters (letters, digits, underscore)
words = re.findall(r"\w+", text)  # ['Hello123', 'World456']

# Whitespace (the text contains a single space)
spaces = re.findall(r"\s", text)  # [' ']

# Non-digits (\D is the complement of \d; \W and \S complement \w and \s the same way)
non_digits = re.findall(r"\D", text)  # ['H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd', '!']

print(f"Digits: {digits}")
print(f"Words: {words}")

\d, \w, and \s match digits, word characters (letters, digits, underscore), and whitespace (spaces, tabs, newlines). The uppercase versions \D, \W, and \S match the opposite sets.

Custom Character Classes

text = "Python 3.9 is awesome!"

# Match lowercase vowels (character classes are case-sensitive by default)
vowels = re.findall(r"[aeiou]", text)  # ['o', 'i', 'a', 'e', 'o', 'e']

# Match consonants with a negated class: exclude vowels of either case and
# everything that is not a letter (\W = non-word characters, \d = digits,
# _ = underscore), so punctuation such as '.' and '!' is not reported.
consonants = re.findall(r"[^aeiouAEIOU\W\d_]", text)  # ['P', 'y', 't', 'h', 'n', 's', 'w', 's', 'm']

# Match specific characters
specific = re.findall(r"[aeou]", text)  # ['o', 'a', 'e', 'o', 'e']

# Match ranges
letters = re.findall(r"[a-zA-Z]", text)  # All letters

print(f"Vowels: {vowels}")
print(f"Consonants: {consonants}")

[abc] matches any of a, b, c. [^abc] matches anything except a, b, c. [a-z] matches ranges.

Quantifiers

Control how many times patterns repeat.

Greedy Quantifiers

text = "Hellooooo Worldddd"

# Zero or more: besides the runs of 'o', an empty match is reported at every
# position where no 'o' starts (including the end of the string).
zero_or_more = re.findall(r"o*", text)  # ['', '', '', '', 'ooooo', '', '', 'o', '', '', '', '', '', '', '']

# One or more ("World" contains an 'o' as well)
one_or_more = re.findall(r"o+", text)  # ['ooooo', 'o']

# Zero or one: each 'o' is matched on its own, with empty matches elsewhere
zero_or_one = re.findall(r"o?", text)  # ['', '', '', '', 'o', 'o', 'o', 'o', 'o', '', '', 'o', '', '', '', '', '', '', '']

# Exactly n times
exactly_three = re.findall(r"o{3}", text)  # ['ooo']

# At least n times
at_least_three = re.findall(r"o{3,}", text)  # ['ooooo']

# Between n and m times (greedy: takes four of the five o's)
between_two_four = re.findall(r"o{2,4}", text)  # ['oooo']

print(f"One or more: {one_or_more}")
print(f"Exactly three: {exactly_three}")

* (0+), + (1+), ? (0-1), {n}, {n,}, {n,m} control repetition.

Non-Greedy Quantifiers

html = "<div>Hello</div><div>World</div>"

# Greedy (matches too much): .* runs on to the last </div> in the string
greedy = re.findall(r"<div>.*</div>", html)  # ['<div>Hello</div><div>World</div>']

# Non-greedy (matches minimally): .*? stops at the first </div> it can reach
non_greedy = re.findall(r"<div>.*?</div>", html)  # ['<div>Hello</div>', '<div>World</div>']

print(f"Greedy: {greedy}")
print(f"Non-greedy: {non_greedy}")

Add ? after quantifier for non-greedy matching. Matches as little as possible.

Anchors and Boundaries

Match positions rather than characters.

Position Anchors

text = "Python is fun. Python is powerful."

# Start of string: only the first "Python" sits at position 0
start = re.findall(r"^Python", text)  # ['Python']

# End of string (the dot is escaped so it matches a literal '.')
end = re.findall(r"powerful\.$", text)  # ['powerful.']

# Word boundaries
word_bound = re.findall(r"\bPython\b", text)  # ['Python', 'Python']

# Not word boundary: both occurrences are followed by a space, which is a
# boundary, so \B never succeeds here
not_bound = re.findall(r"Python\B", text)  # []

print(f"Start: {start}")
print(f"Word boundaries: {word_bound}")

^ matches start, $ matches end, \b matches word boundaries.

Groups and Capturing

Group patterns and capture matched text.

Capturing Groups

text = "John Doe, Jane Smith, Bob Johnson"

# Capture first and last names: with two groups, findall returns 2-tuples
names = re.findall(r"(\w+)\s+(\w+)", text)  # [('John', 'Doe'), ('Jane', 'Smith'), ('Bob', 'Johnson')]

# Extract email components: group 1 is everything before the '@', group 2
# everything after it
email = "user@example.com"
email_match = re.match(r"([^@]+)@([^@]+)", email)
if email_match:
    username, domain = email_match.groups()
    print(f"Username: {username}, Domain: {domain}")  # Username: user, Domain: example.com

# Named groups: (?P<name>...) lets a group be retrieved by name
date_text = "2023-12-25"
date_match = re.search(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})", date_text)
if date_match:
    print(f"Year: {date_match.group('year')}")    # Year: 2023
    print(f"Month: {date_match.group('month')}")  # Month: 12

Parentheses () create capturing groups. Access with .groups() or .group(n).

Non-Capturing Groups

text = "apple orange banana"

# Capturing groups (unnecessary): the parentheses create group 1, and findall
# returns the text of that group
capturing = re.findall(r"(apple|orange|banana)", text)  # ['apple', 'orange', 'banana']

# Non-capturing groups (better for alternation): (?:...) only groups the
# alternatives, so findall returns the whole match
non_capturing = re.findall(r"(?:apple|orange|banana)", text)  # ['apple', 'orange', 'banana']

print(f"Fruits: {non_capturing}")

(?:...) creates non-capturing groups. Use for alternation without capturing.

Common Regex Functions

Essential functions in the re module.

re.search()

text = "The price is $19.99 and tax is $2.50"

# Find first match (\$ is a literal dollar sign; a bare $ is the end anchor)
price_match = re.search(r"\$\d+\.\d{2}", text)
if price_match:
    print(f"First price: {price_match.group()}")  # $19.99
    print(f"Position: {price_match.span()}")      # (13, 19) -> (start, end) with end exclusive

Returns match object for first occurrence, or None.

re.findall()

text = "Numbers: 123, 456, 789"

# Find all matches: no groups in the pattern, so each item is the whole match
numbers = re.findall(r"\d+", text)  # ['123', '456', '789']

# With groups: two groups in the pattern, so each item is a 2-tuple
pairs = re.findall(r"(\d+),\s*(\d+)", "1,2 3,4 5,6")  # [('1', '2'), ('3', '4'), ('5', '6')]

print(f"Numbers: {numbers}")

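Returns a list of all non-overlapping matches: plain strings when the pattern has no groups or exactly one group, and tuples of strings when it has two or more groups.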

re.finditer()

text = "Python 2.7, Python 3.6, Python 3.9"

# Iterator of match objects (produced lazily, one per occurrence)
matches = re.finditer(r"Python (\d+\.\d+)", text)

for match in matches:
    version = match.group(1)  # text captured by the first group, e.g. "2.7"
    position = match.start()  # index in text where the whole match begins
    print(f"Version {version} at position {position}")

Returns iterator of match objects. Useful for large texts.

re.match()

text = "Python is great"

# Match from start only
match = re.match(r"Python", text)  # Match object
no_match = re.match(r"great", text)  # None: "great" is present but not at position 0

if match:
    print(f"Matched: {match.group()}")  # Python

Only matches from string beginning. Use re.search() for anywhere.

re.sub()

text = "I have 2 apples and 3 oranges"

# Substitute a fixed string for every run of digits.
censored = re.sub(r"\d+", "X", text)  # "I have X apples and X oranges"


def double_number(match):
    """Return the integer matched by *match*, multiplied by two, as a string."""
    return str(2 * int(match.group()))


# A callable replacement is invoked with each match object and must return
# the replacement text.
doubled = re.sub(r"\d+", double_number, text)  # "I have 4 apples and 6 oranges"

print(f"Censored: {censored}")
print(f"Doubled: {doubled}")

Replace matches with strings or function results.

re.split()

text = "apple, banana; orange: grape"

# Split on multiple delimiters (the trailing \s* also consumes the space after each one)
fruits = re.split(r"[,;:]\s*", text)  # ['apple', 'banana', 'orange', 'grape']

# Split with capture groups: the captured delimiter is kept in the result list
with_delims = re.split(r"([,;:])\s*", text)  # ['apple', ',', 'banana', ';', 'orange', ':', 'grape']

print(f"Fruits: {fruits}")

Split strings on regex patterns. Capturing groups include delimiters.

Compilation and Flags

Improve performance and modify behavior.

Compiled Regex

import time

pattern = re.compile(r"\b\w+\b")
sample = "This is a test string"
iterations = 10000

# time.perf_counter() is the clock intended for measuring short durations;
# time.time() follows the wall clock, which may be adjusted mid-benchmark.

# Without compilation (the module-level function is called with the pattern string each time)
start = time.perf_counter()
for _ in range(iterations):
    re.findall(r"\b\w+\b", sample)
uncompiled_time = time.perf_counter() - start

# With compilation (the pattern object is built once, outside the loop)
start = time.perf_counter()
for _ in range(iterations):
    pattern.findall(sample)
compiled_time = time.perf_counter() - start

print(f"Uncompiled: {uncompiled_time:.4f}s")
print(f"Compiled: {compiled_time:.4f}s")

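Compile patterns that are used repeatedly. The re module caches recently used patterns, so the speed-up is usually modest; the main gains are skipping the per-call cache lookup and giving the pattern a reusable name.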

Regex Flags

text = """Python is great
Java is also good
python is case sensitive"""

# Case insensitive: matches both the capitalised and the lowercase spelling
case_insensitive = re.findall(r"python", text, re.IGNORECASE)  # ['Python', 'python']

# Multiline mode: ^ matches at the start of every line; only line 1 starts with "Python"
multiline = re.findall(r"^Python", text, re.MULTILINE)  # ['Python']

# Dot matches newline: .* can cross the line break between lines 1 and 2
dot_all = re.findall(r"Python.*good", text, re.DOTALL)  # ['Python is great\nJava is also good']

# Verbose mode (readable patterns)
pattern = re.compile(r"""
    \b\d{1,3}  # 1-3 digits
    \.         # dot
    \d{1,3}    # 1-3 digits  
    \.         # dot
    \d{1,3}    # 1-3 digits
    \.         # dot
    \d{1,3}    # 1-3 digits
    \b
""", re.VERBOSE)

# Search with the verbose-mode pattern compiled above; search() returns None
# when the text contains no match.
ip = pattern.search("My IP is 192.168.1.1")
if ip:
    print(f"IP found: {ip.group()}")

print(f"Case insensitive: {case_insensitive}")

Flags modify regex behavior. Combine with | (e.g., re.IGNORECASE | re.MULTILINE).

Practical Examples

Email Validation

def is_valid_email(email):
    """Return True if *email* looks like a basic ``local@domain.tld`` address.

    The check is deliberately simple: one or more word characters, dots or
    hyphens on each side of a single ``@``, with the domain ending in a dot
    followed by word characters.

    Args:
        email: The string to validate.

    Returns:
        True when the whole string matches the pattern, False otherwise.
    """
    pattern = r"[\w.-]+@[\w.-]+\.\w+"
    # fullmatch() requires the entire string to match. With match() and a
    # trailing "$", an address followed by a newline would be accepted,
    # because "$" also matches just before a final newline.
    return re.fullmatch(pattern, email) is not None

# Two well-formed and two malformed addresses to exercise the validator.
emails = ["user@example.com", "test.email@domain.co.uk", "invalid@", "@domain.com"]
for email in emails:
    print(f"{email}: {is_valid_email(email)}")

Simple email validation regex. Not perfect but covers basic cases.

Phone Number Extraction

text = """
Call me at (555) 123-4567 or 555-123-4567.
Also try 555.123.4567 or +1-555-123-4567.
"""

# Match various phone formats. The pattern has four capturing groups:
# country code (optional), area code, first three digits, last four digits.
phone_pattern = re.compile(r"""
    (\+?\d{1,2}[\s.-]?)?  # Optional country code
    \(?(\d{3})\)?[\s.-]?   # Area code (with optional parens)
    (\d{3})[\s.-]?         # First 3 digits
    (\d{4})                # Last 4 digits
""", re.VERBOSE)

# With several groups, findall returns one tuple per match; an optional
# group that did not take part in the match appears as an empty string.
phones = phone_pattern.findall(text)
print("Phone numbers found:")
for phone in phones:
    # strip() removes the leading space left by an empty country-code group.
    print(f"{' '.join(phone).strip()}")

Extract phone numbers in various formats.

HTML Tag Removal

html = "<p>This is <b>bold</b> text with <a href='#'>links</a>.</p>"

# Remove HTML tags: "<", then one or more characters that are not ">", then ">"
clean_text = re.sub(r"<[^>]+>", "", html)
print(f"Clean text: {clean_text}")  # "This is bold text with links."

Strip HTML tags from text. Simple approach (not perfect for complex HTML).

Log File Parsing

log_line = '2023-12-25 10:30:45 ERROR app.py:123 Database connection failed'

# Parse log entry into: date, time, level, source file, line number, message.
# The file name is matched with \S+ rather than .+ so that a ":<digits> "
# sequence inside the message cannot be mistaken for the file/line separator.
pattern = r"(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) (\w+) (\S+):(\d+) (.+)"
match = re.match(pattern, log_line)

if match:
    # Field names are chosen so that unpacking does not rebind the name of the
    # time module imported earlier in this file.
    date, timestamp, level, source_file, line_number, message = match.groups()
    print(f"Date: {date}")
    print(f"Level: {level}")
    print(f"Message: {message}")

Extract structured data from log files.

Best Practices

  1. Use raw strings for patterns
  2. Compile regex for repeated use
  3. Test patterns thoroughly
  4. Use non-greedy quantifiers when appropriate
  5. Consider readability with verbose mode
  6. Handle edge cases and invalid input
  7. Use specific patterns rather than overly broad ones

External Resources:

Related Tutorials:

Last updated on