#1 Data Analytics Program in India
₹2,499 ₹1,499 Enroll Now
8 min read
• Question 26 of 41 · medium

Regular Expressions

Pattern matching with regex.

What You'll Learn

  • Basic regex syntax and patterns
  • Python's re module functions
  • Capturing groups and backreferences
  • Common regex patterns for validation
  • Performance with compiled patterns

Regex Basics

Regular expressions are patterns for matching text. Python uses the re module:

code.pyPython
import re

text = "Hello, World! Hello, Python!"

# search(): scan the string and stop at the first occurrence.
# The result is a match object, or None when nothing matches.
match = re.search(pattern=r"Hello", string=text)
if match is not None:
    # Prints, in order: "Hello", 0, 5, (0, 5)
    for detail in (match.group(), match.start(), match.end(), match.span()):
        print(detail)

# findall(): every non-overlapping occurrence, as plain strings.
matches = re.findall(pattern=r"Hello", string=text)
print(matches)  # ["Hello", "Hello"]

# finditer(): same occurrences, but as match objects (positions included).
for match in re.finditer(pattern=r"Hello", string=text):
    print(f"Found at {match.span()}")

# sub(): substitute every occurrence with the replacement text.
new_text = re.sub(pattern=r"Hello", repl="Hi", string=text)
print(new_text)  # "Hi, World! Hi, Python!"

# split(): cut the string wherever the pattern matches.
# The text ends with a delimiter ("!"), hence the trailing empty string.
parts = re.split(pattern=r"[,!]\s*", string=text)
print(parts)  # ['Hello', 'World', 'Hello', 'Python', '']

Pattern Syntax

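Pattern | Matches
.       | Any character except newline
\d      | Digit [0-9]
\D      | Non-digit
\w      | Word char [a-zA-Z0-9_]
\W      | Non-word char
\s      | Whitespace [\t\n\r\f\v ]
\S      | Non-whitespace
^       | Start of string
$       | End of string (or just before a trailing newline)
\b      | Word boundary

Note: the ASCII sets shown for \d, \w and \s are exact only with re.ASCII; on str patterns Python also matches the Unicode equivalents by default.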

Quantifiers

Pattern | Meaning
*       | 0 or more
+       | 1 or more
?       | 0 or 1 (optional)
{n}     | Exactly n times
{n,}    | n or more times
{n,m}   | Between n and m times
code.pyPython
import re

# Quantifiers decide how many repetitions of the preceding token may match.
text = "I have 3 cats and 12 dogs"

single_digits = re.findall(r"\d", text)    # one digit per match: ['3', '1', '2']
digit_runs = re.findall(r"\d+", text)      # whole runs of digits: ['3', '12']
digit_pairs = re.findall(r"\d{2}", text)   # exactly two digits: ['12']

# Greedy quantifiers take as much as possible; adding ? makes them take
# as little as possible.
html = "<div>content</div>"
greedy_tags = re.findall(r"<.*>", html)   # ['<div>content</div>'] greedy
lazy_tags = re.findall(r"<.*?>", html)    # ['<div>', '</div>'] non-greedy

Character Classes

code.pyPython
# [abc] - matches a, b, or c
# [^abc] - matches anything except a, b, c
# [a-z] - range a to z
# [A-Za-z0-9] - alphanumeric

import re

text = "The quick brown fox jumps over the lazy dog"

# Words whose first letter is q or f: \b anchors at a word start,
# [qf] picks the first letter, \w+ consumes the rest of the word.
q_or_f_words = re.findall(r"\b[qf]\w+", text)  # ['quick', 'fox']

# Every lowercase vowel, one character per match.
vowels = re.findall(r"[aeiou]", text)  # ['e', 'u', 'i', 'o', 'o', 'u', ...]

# Negated class: anything that is neither a lowercase vowel nor whitespace.
non_vowels = re.findall(r"[^aeiou\s]", text)  # ['T', 'h', 'q', 'c', 'k', ...]

Groups and Capturing

code.pyPython
import re

# Capturing groups ()
# Each parenthesised part is numbered from 1, left to right; group 0 is
# the entire match.
pattern = r"(\w+)@(\w+)\.(\w+)"
match = re.search(pattern, "Contact: john@email.com")

# re.search() returns None when nothing matches, so guard before use.
if match:
    print(match.group(0))  # "john@email.com" (entire match)
    print(match.group(1))  # "john"
    print(match.group(2))  # "email"
    print(match.group(3))  # "com"
    print(match.groups())  # ("john", "email", "com")

# Named groups (?P<name>...)
# Groups can be read back by name instead of by position.
pattern = r"(?P<user>\w+)@(?P<domain>[\w.]+)"
match = re.search(pattern, "john@email.com")

if match:
    print(match.group('user'))    # "john"
    print(match.group('domain'))  # "email.com"
    print(match.groupdict())      # {'user': 'john', 'domain': 'email.com'}

# Non-capturing groups (?:...)
# Groups but doesn't capture: the optional scheme is matched but does not
# appear in groups().
pattern = r"(?:https?://)?(\w+\.\w+)"
match = re.search(pattern, "https://example.com")

# Guarded like the examples above so a non-matching input cannot raise
# AttributeError on None.
if match:
    print(match.groups())  # ('example.com',)

# Backreferences
# Find repeated words: \1 must repeat exactly what group 1 captured.
text = "the the quick quick fox"
duplicates = re.findall(r"\b(\w+)\s+\1\b", text)
print(duplicates)  # ['the', 'quick']

Common Validation Patterns

code.pyPython
import re

# Email validation
# NOTE: `$` also matches just before a trailing newline, so an anchored
# pattern used with re.match() would accept "user@example.com" followed by
# a newline. Validate with re.fullmatch() instead.
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'


def is_valid_email(email: str) -> bool:
    """Return True if *email* matches ``email_pattern`` in its entirety.

    re.fullmatch() requires the match to cover the whole string, so input
    with a trailing newline is rejected.
    """
    return re.fullmatch(email_pattern, email) is not None


print(is_valid_email("user@example.com"))  # True
print(is_valid_email("invalid-email"))     # False

# Phone number (US format)
phone_pattern = r'^\(?\d{3}\)?[-.]?\d{3}[-.]?\d{4}$'
# Matches: 123-456-7890, (123)456-7890, 123.456.7890
# The two parentheses are optional independently of each other, so an
# unbalanced "(123-456-7890" matches as well.

# URL validation
# This pattern has no ^/$ anchors: use re.fullmatch() to validate a whole
# string, or re.search()/re.findall() to locate URLs inside longer text.
url_pattern = r'https?://[\w.-]+(?:/[\w.-]*)*/?'

# Password strength
# At least 8 chars, 1 upper, 1 lower, 1 digit, 1 special
# Each (?=...) lookahead checks one requirement without consuming input.
# Only letters, digits and the specials @$!%*?& are allowed at all.
password_pattern = r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$'

Flags (Modifiers)

code.pyPython
import re

text = "Hello\nHELLO\nhello"

# IGNORECASE (short form re.I): letters match regardless of case.
any_case = re.findall(r"hello", text, flags=re.IGNORECASE)  # ['Hello', 'HELLO', 'hello']

# MULTILINE (short form re.M): ^ and $ also anchor at every line boundary.
line_starts = re.findall(r"^hello", text, flags=re.IGNORECASE | re.MULTILINE)  # ['Hello', 'HELLO', 'hello']

# DOTALL (short form re.S): . is allowed to match a newline as well.
across_lines = re.findall(r"H.*o", text, flags=re.DOTALL)  # ['Hello\nHELLO\nhello']

# VERBOSE (short form re.X): whitespace and # comments inside the pattern
# are ignored, so a long expression can be laid out and annotated.
pattern = re.compile(r'''
    \d{3}    # Area code
    [-.]?     # Optional separator
    \d{3}    # First 3 digits
    [-.]?     # Optional separator
    \d{4}    # Last 4 digits
''', flags=re.VERBOSE)

Compiled Patterns

code.pyPython
import re

# Compile for reuse (performance optimization)
# The domain part [\w-]+ contains no dot, so only addresses of the form
# name@label.tld match (no subdomains).
email_pattern = re.compile(r'^[\w.+-]+@[\w-]+\.[a-z]{2,}$', re.I)

# Use compiled pattern
emails = ["user@example.com", "invalid", "test@test.org"]
for email in emails:
    # fullmatch() rather than match(): `$` also matches just before a
    # trailing newline, so match() would accept "user@example.com" followed
    # by a newline as valid.
    if email_pattern.fullmatch(email):
        print(f"Valid: {email}")

# Access pattern info
print(email_pattern.pattern)  # The pattern string
print(email_pattern.flags)    # The flags used

Interview Tip

When asked about regular expressions:

  1. Use raw strings r"" to avoid escaping backslashes
  2. Know \d, \w, \s and their negations
  3. Understand greedy (*, +) vs non-greedy (*?, +?) matching
  4. Use named groups for readability
  5. Compile patterns used multiple times