Regular Expressions | Python Interview Questions

What You'll Learn

Basic regex syntax and patterns
Python's re module functions
Capturing groups and backreferences
Common regex patterns for validation
Performance with compiled patterns

Regex Basics

Regular expressions are patterns for matching text. Python uses the re module:

code.pyPython

import re

text = "Hello, World! Hello, Python!"
word = r"Hello"  # the literal pattern every lookup below searches for

# search() - first match only; the result is None when nothing matches
match = re.search(word, text)
if match is not None:
    print(match.group())  # "Hello"
    print(match.start())  # 0
    print(match.end())    # 5
    print(match.span())   # (0, 5)

# findall() - every non-overlapping match, returned as plain strings
matches = re.findall(word, text)
print(matches)  # ["Hello", "Hello"]

# finditer() - the same matches, produced lazily as match objects
for match in re.finditer(word, text):
    print(f"Found at {match.span()}")

# sub() - substitute a replacement for every match
new_text = re.sub(pattern=word, repl="Hi", string=text)
print(new_text)  # "Hi, World! Hi, Python!"

# split() - cut the string wherever the separator pattern matches;
# the final "!" sits at the very end, hence the trailing empty string
separator = r"[,!]\s*"
parts = re.split(separator, text)
print(parts)  # ['Hello', 'World', 'Hello', 'Python', '']

Pattern Syntax

Pattern	Matches
`.`	Any character except newline
`\d`	Digit [0-9] (any Unicode decimal digit for str patterns unless re.ASCII is set)
`\D`	Non-digit
`\w`	Word char [a-zA-Z0-9_] (plus Unicode letters and digits for str patterns unless re.ASCII is set)
`\W`	Non-word char
`\s`	Whitespace [\t\n\r\f\v ]
`\S`	Non-whitespace
`^`	Start of string
`$`	End of string (also matches just before a trailing newline)
`\b`	Word boundary

Quantifiers

Pattern	Meaning
`*`	0 or more
`+`	1 or more
`?`	0 or 1 (optional)
`{n}`	Exactly n times
`{n,}`	n or more times
`{n,m}`	Between n and m times

code.pyPython

import re

# Quantifier examples
# Each result is bound to a name and printed so the snippet produces visible
# output when run as a script instead of silently discarding the lists.
text = "I have 3 cats and 12 dogs"

single_digits = re.findall(r"\d", text)
print(single_digits)  # ['3', '1', '2']

digit_runs = re.findall(r"\d+", text)
print(digit_runs)     # ['3', '12']

digit_pairs = re.findall(r"\d{2}", text)
print(digit_pairs)    # ['12']

# Greedy vs non-greedy
html = "<div>content</div>"

# Greedy: .* grabs as much as possible, so one match spans both tags
greedy_tags = re.findall(r"<.*>", html)
print(greedy_tags)    # ['<div>content</div>'] greedy

# Non-greedy: .*? stops at the first ">", giving one match per tag
lazy_tags = re.findall(r"<.*?>", html)
print(lazy_tags)      # ['<div>', '</div>'] non-greedy

Character Classes

code.pyPython

# [abc] - matches a, b, or c
# [^abc] - matches anything except a, b, c
# [a-z] - range a to z
# [A-Za-z0-9] - alphanumeric

import re

text = "The quick brown fox jumps over the lazy dog"

# Each result is bound and printed so the snippet shows its output when run
# as a script rather than discarding the lists.

# Find words starting with specific letters
qf_words = re.findall(r"\b[qf]\w+", text)
print(qf_words)    # ['quick', 'fox']

# Find vowels (lowercase only - the class is case-sensitive)
vowels = re.findall(r"[aeiou]", text)
print(vowels)      # ['e', 'u', 'i', 'o', 'o', 'u', ...]

# Find non-vowels (whitespace is excluded too via \s inside the negated class)
non_vowels = re.findall(r"[^aeiou\s]", text)
print(non_vowels)  # ['T', 'h', 'q', 'c', 'k', ...]

Groups and Capturing

code.pyPython

import re

# Capturing groups ()
pattern = r"(\w+)@(\w+)\.(\w+)"
match = re.search(pattern, "Contact: john@email.com")

if match:
    print(match.group(0))  # "john@email.com" (entire match)
    print(match.group(1))  # "john"
    print(match.group(2))  # "email"
    print(match.group(3))  # "com"
    print(match.groups())  # ("john", "email", "com")

# Named groups (?P<name>...)
pattern = r"(?P<user>\w+)@(?P<domain>[\w.]+)"
match = re.search(pattern, "john@email.com")

if match:
    print(match.group('user'))    # "john"
    print(match.group('domain'))  # "email.com"
    print(match.groupdict())      # {'user': 'john', 'domain': 'email.com'}

# Non-capturing groups (?:...)
# Groups but doesn't capture
pattern = r"(?:https?://)?(\w+\.\w+)"
match = re.search(pattern, "https://example.com")
# re.search() returns None when nothing matches, so guard before using the
# result - same as the two searches above.
if match:
    print(match.groups())  # ('example.com',)

# Backreferences
# Find repeated words: \1 must repeat exactly what group 1 captured
text = "the the quick quick fox"
duplicates = re.findall(r"\b(\w+)\s+\1\b", text)
print(duplicates)  # ['the', 'quick']

Common Validation Patterns

code.pyPython

import re

# Email validation
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'


def is_valid_email(email):
    """Return True if *email* matches ``email_pattern`` in its entirety.

    ``re.fullmatch`` is used instead of ``re.match`` because ``$`` also
    matches just before a trailing newline, so ``re.match`` would wrongly
    accept ``"user@example.com\\n"``.
    """
    return re.fullmatch(email_pattern, email) is not None


print(is_valid_email("user@example.com"))  # True
print(is_valid_email("invalid-email"))     # False

# Phone number (US format)
# The area code is either fully parenthesised or bare, so unbalanced input
# such as "(123-456-7890" or "123)456-7890" is rejected. A single space is
# also accepted as a separator, which covers the common "(123) 456-7890".
phone_pattern = r'^(?:\(\d{3}\)|\d{3})[-. ]?\d{3}[-. ]?\d{4}$'
# Matches: 123-456-7890, (123)456-7890, (123) 456-7890, 123.456.7890

# URL validation
# Unanchored: finds an http(s) URL anywhere in the text when used with search()
url_pattern = r'https?://[\w.-]+(?:/[\w.-]*)*/?'

# Password strength
# At least 8 chars, 1 upper, 1 lower, 1 digit, 1 special
# Each (?=...) lookahead checks one requirement without consuming characters
password_pattern = r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$'

Flags (Modifiers)

code.pyPython

import re

text = "Hello\nHELLO\nhello"

# Each result is bound and printed so the snippet shows its output when run
# as a script rather than discarding the lists.

# re.IGNORECASE (re.I) - Case insensitive
any_case = re.findall(r"hello", text, re.I)
print(any_case)      # ['Hello', 'HELLO', 'hello']

# re.MULTILINE (re.M) - ^ and $ match line boundaries
# (without re.M, ^ anchors only at the start of the string: ['Hello'])
line_starts = re.findall(r"^hello", text, re.I | re.M)
print(line_starts)   # ['Hello', 'HELLO', 'hello']

# re.DOTALL (re.S) - . matches newline too
across_lines = re.findall(r"H.*o", text, re.S)
print(across_lines)  # ['Hello\nHELLO\nhello']

# re.VERBOSE (re.X) - Allow whitespace and comments
# Unescaped whitespace and everything after "#" in the pattern are ignored
pattern = re.compile(r'''
    \d{3}    # Area code
    [-.]?     # Optional separator
    \d{3}    # First 3 digits
    [-.]?     # Optional separator
    \d{4}    # Last 4 digits
''', re.VERBOSE)

Compiled Patterns

code.pyPython

import re

# Compile once and reuse the pattern object. The re module caches recent
# patterns, so the gain is mainly skipping the cache lookup on each call,
# plus a named, reusable object.
# The domain class includes "." so sub-domains such as user@mail.example.com
# are accepted.
email_pattern = re.compile(r'^[\w.+-]+@[\w.-]+\.[a-z]{2,}$', re.I)

# Use compiled pattern
emails = ["user@example.com", "invalid", "test@test.org"]
for email in emails:
    # fullmatch() requires the whole string to match; match() with "$" would
    # still accept a trailing newline.
    if email_pattern.fullmatch(email):
        print(f"Valid: {email}")

# Access pattern info
print(email_pattern.pattern)  # The pattern string
print(email_pattern.flags)    # The flags used

Interview Tip

When asked about regular expressions:

Use raw strings r"" to avoid escaping backslashes
Know \d, \w, \s and their negations
Understand greedy (`*`, `+`) vs non-greedy (`*?`, `+?`) matching
Use named groups for readability
Compile patterns used multiple times