8 min read
• Question 26 of 41 · Medium · Regular Expressions
Pattern matching with regex.
What You'll Learn
- Basic regex syntax and patterns
- Python's re module functions
- Capturing groups and backreferences
- Common regex patterns for validation
- Performance with compiled patterns
Regex Basics
Regular expressions are patterns for matching text. Python uses the re module:
code.pyPython
import re

text = "Hello, World! Hello, Python!"

# search() - Find first match.
# It returns None when nothing matches, so guard before using the result.
match = re.search(r"Hello", text)
if match:
    print(match.group())  # "Hello"
    print(match.start())  # 0
    print(match.end())  # 5
    print(match.span())  # (0, 5)

# findall() - Find all matches (returns strings)
matches = re.findall(r"Hello", text)
print(matches)  # ["Hello", "Hello"]

# finditer() - Find all matches (returns match objects)
# Prints "Found at (0, 5)" and then "Found at (14, 19)".
for match in re.finditer(r"Hello", text):
    print(f"Found at {match.span()}")

# sub() - Replace matches
new_text = re.sub(r"Hello", "Hi", text)
print(new_text)  # "Hi, World! Hi, Python!"

# split() - Split by pattern
# The text ends with "!", a separator, so the last element is an empty string.
parts = re.split(r"[,!]\s*", text)
print(parts) # ['Hello', 'World', 'Hello', 'Python', '']
Pattern Syntax
| Pattern | Matches |
|---|---|
| . | Any character except newline |
| \d | Digit [0-9] |
| \D | Non-digit |
| \w | Word char [a-zA-Z0-9_] |
| \W | Non-word char |
| \s | Whitespace [\t\n\r\f\v ] |
| \S | Non-whitespace |
| ^ | Start of string |
| $ | End of string |
| \b | Word boundary |
Quantifiers
| Pattern | Meaning |
|---|---|
| * | 0 or more |
| + | 1 or more |
| ? | 0 or 1 (optional) |
| {n} | Exactly n times |
| {n,} | n or more times |
| {n,m} | Between n and m times |
code.pyPython
import re
# Quantifier examples
text = "I have 3 cats and 12 dogs"
# A bare \d matches one digit at a time, so "12" comes back as two matches.
re.findall(r"\d", text) # ['3', '1', '2']
# + groups each run of consecutive digits into a single match.
re.findall(r"\d+", text) # ['3', '12']
# {2} needs exactly two digits in a row; the lone "3" does not qualify.
re.findall(r"\d{2}", text) # ['12']
# Greedy vs non-greedy
html = "<div>content</div>"
# Greedy .* takes as much as it can: from the first "<" to the last ">".
re.findall(r"<.*>", html) # ['<div>content</div>'] greedy
re.findall(r"<.*?>", html) # ['<div>', '</div>'] non-greedy
Character Classes
code.pyPython
# [abc] - matches a, b, or c
# [^abc] - matches anything except a, b, c
# [a-z] - range a to z
# [A-Za-z0-9] - alphanumeric
import re
text = "The quick brown fox jumps over the lazy dog"
# Find words starting with specific letters
# \b pins the class to a word start; \w+ then takes the rest of the word.
re.findall(r"\b[qf]\w+", text) # ['quick', 'fox']
# Find vowels
# The class lists lowercase letters only, so uppercase vowels are not matched.
re.findall(r"[aeiou]", text) # ['e', 'u', 'i', 'o', 'o', 'u', ...]
# Find non-vowels (the \s inside the negated class also excludes whitespace)
re.findall(r"[^aeiou\s]", text) # ['T', 'h', 'q', 'c', 'k', ...]
Groups and Capturing
code.pyPython
import re

# Capturing groups ()
# group(0) is the whole match; group(1..n) are the parenthesised parts.
pattern = r"(\w+)@(\w+)\.(\w+)"
match = re.search(pattern, "Contact: john@email.com")
if match:
    print(match.group(0))  # "john@email.com" (entire match)
    print(match.group(1))  # "john"
    print(match.group(2))  # "email"
    print(match.group(3))  # "com"
    print(match.groups())  # ("john", "email", "com")

# Named groups (?P<name>...)
pattern = r"(?P<user>\w+)@(?P<domain>[\w.]+)"
match = re.search(pattern, "john@email.com")
if match:
    print(match.group('user'))  # "john"
    print(match.group('domain'))  # "email.com"
    print(match.groupdict())  # {'user': 'john', 'domain': 'email.com'}

# Non-capturing groups (?:...)
# Groups but doesn't capture
pattern = r"(?:https?://)?(\w+\.\w+)"
match = re.search(pattern, "https://example.com")
# Guarded like the examples above: re.search returns None on no match.
if match:
    print(match.groups())  # ('example.com',)

# Backreferences
# Find repeated words
# \1 must repeat exactly what group 1 captured; findall returns that group.
text = "the the quick quick fox"
duplicates = re.findall(r"\b(\w+)\s+\1\b", text)
print(duplicates) # ['the', 'quick']
Common Validation Patterns
code.pyPython
import re
# Email validation
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'


def is_valid_email(email: str) -> bool:
    """Return True if the whole of *email* matches ``email_pattern``.

    ``re.fullmatch`` is used instead of ``re.match`` because ``$`` also
    matches just before a trailing newline, so ``re.match`` would accept an
    address followed by a line break.
    """
    return bool(re.fullmatch(email_pattern, email))


print(is_valid_email("user@example.com"))  # True
print(is_valid_email("invalid-email"))  # False
# Phone number (US format)
# NOTE: the opening and closing parentheses are each optional on their own,
# so unbalanced input such as "(123-456-7890" also matches. Separators are
# optional too, so "1234567890" matches.
phone_pattern = r'^\(?\d{3}\)?[-.]?\d{3}[-.]?\d{4}$'
# Matches: 123-456-7890, (123)456-7890, 123.456.7890
# URL validation
# NOTE: this pattern has no ^ / $ anchors, so with re.search it finds a URL
# anywhere inside a longer string rather than validating the whole string.
url_pattern = r'https?://[\w.-]+(?:/[\w.-]*)*/?'
# Password strength
# At least 8 chars, 1 upper, 1 lower, 1 digit, 1 special
password_pattern = r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$'
Flags (Modifiers)
code.pyPython
import re
text = "Hello\nHELLO\nhello"
# re.IGNORECASE (re.I) - Case insensitive
re.findall(r"hello", text, re.I) # ['Hello', 'HELLO', 'hello']
# re.MULTILINE (re.M) - ^ and $ match line boundaries
# Flags are combined with |. Here ^ matches at the start of each of the three lines.
re.findall(r"^hello", text, re.I | re.M) # ['Hello', 'HELLO', 'hello']
# re.DOTALL (re.S) - . matches newline too
# No re.I here: the greedy .* runs from the first "H" to the last lowercase "o".
re.findall(r"H.*o", text, re.S) # ['Hello\nHELLO\nhello']
# re.VERBOSE (re.X) - Allow whitespace and comments
pattern = re.compile(r'''
\d{3} # Area code
[-.]? # Optional separator
\d{3} # First 3 digits
[-.]? # Optional separator
\d{4} # Last 4 digits
''', re.VERBOSE)
Compiled Patterns
code.pyPython
import re

# Compile for reuse (performance optimization)
# The compiled object exposes the same methods as the module-level functions
# (match, search, findall, ...) without passing the pattern on every call.
email_pattern = re.compile(r'^[\w.+-]+@[\w-]+\.[a-z]{2,}$', re.I)

# Use compiled pattern
emails = ["user@example.com", "invalid", "test@test.org"]
for email in emails:
    if email_pattern.match(email):
        print(f"Valid: {email}")

# Access pattern info
print(email_pattern.pattern)  # The pattern string
print(email_pattern.flags) # The flags used
Interview Tip
When asked about regular expressions:
- Use raw strings r"" to avoid escaping backslashes
- Know \d, \w, \s and their negations
- Understand greedy (*) vs non-greedy (*?) matching
- Use named groups for readability
- Compile patterns used multiple times