Easy · 📖 Теория · 8 min

Регулярные выражения

Модуль re: match, search, findall, sub, compile, группы, lookahead и lookbehind

Регулярные выражения

Регулярные выражения (regex) -- мощный инструмент для поиска, проверки и трансформации текста по шаблонам. В Python регулярные выражения доступны через модуль re.

Основы модуля re

import re

text = "Мой телефон: +7 (999) 123-45-67, рабочий: +7 (495) 987-65-43"

# re.search() - scan the whole string and return the first match (or None)
match = re.search(r'\d{3}-\d{2}-\d{2}', text)
if match:
    print(match.group())  # 123-45-67
    print(match.start())  # 22 (index of the first matched character)
    print(match.end())    # 31 (index just past the last matched character)
    print(match.span())   # (22, 31)

# re.match() - match at the beginning of string only
print(re.match(r'\d+', "123abc"))   # <re.Match object; span=(0, 3), match='123'>
print(re.match(r'\d+', "abc123"))   # None (no match at start)

# re.fullmatch() - the pattern must cover the entire string
print(re.fullmatch(r'\d+', "123"))     # <re.Match object; span=(0, 3), match='123'>
print(re.fullmatch(r'\d+', "123abc"))  # None

re.findall() и re.finditer()

import re

text = "Цены: 100 руб, 250 руб, 1500 руб, 42.50 руб"

# findall() returns a list of all non-overlapping matches as strings
prices = re.findall(r'\d+\.?\d*', text)
print(prices)  # ['100', '250', '1500', '42.50']

# finditer() returns an iterator of Match objects, so positions are available too
for match in re.finditer(r'\d+\.?\d*', text):
    print(f"Цена: {match.group()} на позиции {match.span()}")

# findall with groups - returns only the captured groups;
# with several groups each result is a tuple of strings
text = "Дата: 2026-03-30, обновлено: 2026-04-01"
dates = re.findall(r'(\d{4})-(\d{2})-(\d{2})', text)
print(dates)  # [('2026', '03', '30'), ('2026', '04', '01')]

Синтаксис регулярных выражений

Символьные классы

import re

# . (dot) - any character except newline
print(re.findall(r'c.t', 'cat cot cut c\nt'))  # ['cat', 'cot', 'cut']

# \d - digit, \D - non-digit
# (in str patterns \d matches any Unicode decimal digit; use re.ASCII for [0-9] only)
print(re.findall(r'\d+', 'abc123def456'))  # ['123', '456']

# \w - word character, \W - non-word
# (in str patterns \w matches Unicode letters, digits and "_";
#  with re.ASCII it is exactly [a-zA-Z0-9_])
print(re.findall(r'\w+', 'hello, world! 42'))  # ['hello', 'world', '42']

# \s - whitespace, \S - non-whitespace
print(re.findall(r'\S+', 'hello  world'))  # ['hello', 'world']

# Custom character classes
print(re.findall(r'[aeiou]', 'hello world'))   # ['e', 'o', 'o']
print(re.findall(r'[^aeiou\s]', 'hello'))      # ['h', 'l', 'l'] (negation)
# "x" is not in the class, so the "0x" prefix splits every literal in two
print(re.findall(r'[a-fA-F0-9]+', '0xFF 0x1A')) # ['0', 'FF', '0', '1A']

# Russian letters ("ё"/"Ё" lie outside the а-я/А-Я ranges, so they are listed explicitly)
print(re.findall(r'[а-яА-ЯёЁ]+', 'Привет мир hello'))
# ['Привет', 'мир']

Квантификаторы

import re

# * (0 or more), + (1 or more), ? (0 or 1)
print(re.findall(r'ab*c', 'ac abc abbc'))    # ['ac', 'abc', 'abbc']
print(re.findall(r'ab+c', 'ac abc abbc'))    # ['abc', 'abbc']
print(re.findall(r'ab?c', 'ac abc abbc'))    # ['ac', 'abc'] ('abbc' has two b's)

# {n} exactly n, {n,m} from n to m
print(re.findall(r'\d{3}', '1 12 123 1234'))   # ['123', '123'] (second hit: first 3 digits of '1234')
print(re.findall(r'\d{2,4}', '1 12 123 1234')) # ['12', '123', '1234']

# Greedy vs lazy (non-greedy)
# Greedy quantifiers take as much as possible; adding ? makes them take as little as possible
html = '<b>bold</b> and <i>italic</i>'
print(re.findall(r'<.*>', html))     # ['<b>bold</b> and <i>italic</i>'] (greedy)
print(re.findall(r'<.*?>', html))    # ['<b>', '</b>', '<i>', '</i>'] (lazy)
print(re.findall(r'<.+?>', html))    # ['<b>', '</b>', '<i>', '</i>']

Якоря и границы

import re

# ^ start of string, $ end of string
print(re.findall(r'^\w+', 'hello world'))   # ['hello']
print(re.findall(r'\w+$', 'hello world'))   # ['world']

# \b word boundary
text = "cat category concatenate"
print(re.findall(r'\bcat\b', text))     # ['cat'] (exact word)
# "concatenate" is not matched: there is no word boundary before its inner "cat"
print(re.findall(r'\bcat\w*', text))    # ['cat', 'category']

# Multiline mode: ^ also matches right after each newline
multiline = "first\nsecond\nthird"
print(re.findall(r'^\w+', multiline, re.MULTILINE))
# ['first', 'second', 'third']

Группы

import re

# Numbered groups: counted by their opening parenthesis, starting at 1
match = re.search(r'(\d{4})-(\d{2})-(\d{2})', '2026-03-30')
if match:
    print(match.group(0))  # '2026-03-30' (full match)
    print(match.group(1))  # '2026' (first group)
    print(match.group(2))  # '03'
    print(match.group(3))  # '30'
    print(match.groups())  # ('2026', '03', '30')

# Named groups (?P<name>...): accessible by name as well as by number
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.search(pattern, '2026-03-30')
if match:
    print(match.group('year'))   # '2026'
    print(match.group('month'))  # '03'
    print(match.groupdict())     # {'year': '2026', 'month': '03', 'day': '30'}

# Non-capturing group (?:...)
# Useful when you need grouping but don't need to capture
emails = "user@gmail.com admin@yahoo.com"
pattern = r'\w+@(?:gmail|yahoo)\.com'
print(re.findall(pattern, emails))  # ['user@gmail.com', 'admin@yahoo.com']
# Note: findall returns the full match when the pattern has no capturing groups

# Backreferences \1: match the same text that group 1 captured
# Find repeated words
text = "the the cat sat on on the mat"
pattern = r'\b(\w+)\s+\1\b'
print(re.findall(pattern, text))  # ['the', 'on'] (findall returns the captured group)

# Named backreference (?P=name)
pattern = r'\b(?P<word>\w+)\s+(?P=word)\b'
print(re.findall(pattern, text))  # ['the', 'on']

Альтернация (|)

import re

# Match either pattern
text = "cat dog bird cat fish"
print(re.findall(r'cat|dog', text))  # ['cat', 'dog', 'cat']

# Group alternatives
# For single characters a character class and an alternation are interchangeable
text = "gray grey"
print(re.findall(r'gr[ae]y', text))     # ['gray', 'grey']
print(re.findall(r'gr(?:a|e)y', text))  # ['gray', 'grey'] (same)

# Complex alternatives
# The non-capturing group limits | to the scheme, so findall still returns whole URLs
pattern = r'(?:https?|ftp)://\S+'
urls = "Visit https://python.org or ftp://files.example.com"
print(re.findall(pattern, urls))
# ['https://python.org', 'ftp://files.example.com']

Lookahead и Lookbehind

Утверждения нулевой ширины (lookahead и lookbehind) проверяют контекст, не включая его в совпадение:

import re

# Positive lookahead (?=...)
# Find runs of word characters that are immediately followed by a digit
# (\w+ backtracks off the trailing digit so that the lookahead can see it)
text = "item1 item2 thing item3 stuff"
print(re.findall(r'\w+(?=\d)', text))  # ['item', 'item', 'item']

# Negative lookahead (?!...)
# Find "item" NOT followed by a digit
text = "item1 item item3 items"
print(re.findall(r'item(?!\d)', text))  # ['item', 'item'] (from "item" and "items")

# Positive lookbehind (?<=...)
# Find numbers preceded by $
text = "Price: $100, Cost: €200, Total: $350"
print(re.findall(r'(?<=\$)\d+', text))  # ['100', '350']

# Negative lookbehind (?<!...)
# Find numbers NOT preceded by $
# The lookbehind must reject a preceding digit as well: with (?<!\$) alone the
# search restarts inside "$100" and "$350" and reports ['00', '200', '50'].
print(re.findall(r'(?<![$\d])\d+', text))  # ['200']

# Practical: password validation
def validate_password(password: str) -> bool:
    """Report whether ``password`` satisfies the complexity rules.

    The password needs at least 8 characters and must contain a lowercase
    letter (a-z), an uppercase letter (A-Z) and a digit.
    """
    # Each lookahead scans forward from the start without consuming input,
    # so the three requirements may be met in any order.
    complexity = re.compile(r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$')
    return complexity.match(password) is not None

print(validate_password("Abc12345"))   # True
print(validate_password("abc12345"))   # False (no uppercase)
print(validate_password("Short1"))     # False (< 8 chars)

re.sub() и замена

import re

# Basic replacement: \1, \2, \3 in the replacement refer to the captured groups
text = "Дата: 30-03-2026"
result = re.sub(r'(\d{2})-(\d{2})-(\d{4})', r'\3-\2-\1', text)
print(result)  # Дата: 2026-03-30

# Named group replacement: \g<name> refers to a named group
result = re.sub(
    r'(?P<day>\d{2})-(?P<month>\d{2})-(?P<year>\d{4})',
    r'\g<year>/\g<month>/\g<day>',
    text
)
print(result)  # Дата: 2026/03/30

# Replacement with a function: re.sub calls it once per match and inserts
# the string it returns
def censor_email(match: re.Match) -> str:
    """Return the censored replacement for one matched e-mail address.

    Keeps the first character of the local part (group 1), hides the rest
    behind ``***`` and leaves the domain (group 2) unchanged.
    """
    local, domain = match.group(1), match.group(2)
    return f"{local[0]}***@{domain}"

text = "Контакты: ivan@mail.ru и maria@gmail.com"
result = re.sub(r'(\w+)@(\w+\.\w+)', censor_email, text)
print(result)  # Контакты: i***@mail.ru и m***@gmail.com

# re.subn() - returns a (new_string, number_of_replacements) tuple
text = "aaa bbb aaa"
result, count = re.subn(r'aaa', 'xxx', text)
print(f"{result} ({count} замен)")  # xxx bbb xxx (2 замен)

# Limit replacements: only the first `count` matches are replaced
print(re.sub(r'\d', 'X', 'a1b2c3', count=2))  # aXbXc3

Компиляция и флаги

import re

# Compile a pattern once to get a reusable object with its own
# findall()/search()/sub() methods
phone_pattern = re.compile(r'\+?\d[\d\s\-()]{7,}\d')
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

text = "Звоните: +7 (999) 123-45-67 или пишите: info@example.com"
print(phone_pattern.findall(text))  # ['+7 (999) 123-45-67']
print(email_pattern.findall(text))  # ['info@example.com']

# Flags
# re.IGNORECASE (re.I) - case-insensitive
print(re.findall(r'python', 'Python PYTHON python', re.I))
# ['Python', 'PYTHON', 'python']

# re.MULTILINE (re.M) - ^ and $ match line boundaries
text = "first line\nsecond line\nthird line"
print(re.findall(r'^\w+', text, re.M))  # ['first', 'second', 'third']

# re.DOTALL (re.S) - . matches newline too
text = "<div>\nContent\n</div>"
print(re.findall(r'<div>.*?</div>', text, re.S))
# ['<div>\nContent\n</div>']

# re.VERBOSE (re.X) - allow comments and whitespace inside the pattern
pattern = re.compile(r"""
    (?P<protocol>https?://)   # Protocol (http or https)
    (?P<domain>[^/\s]+)       # Domain name
    (?P<path>/\S*)?           # Optional path
""", re.VERBOSE)

match = pattern.search("Visit https://python.org/docs for details")
if match:
    print(match.groupdict())
    # {'protocol': 'https://', 'domain': 'python.org', 'path': '/docs'}

# Combine flags with bitwise OR
pattern = re.compile(r'hello', re.I | re.M)

Практические примеры

import re

# Validate email
def is_valid_email(email: str) -> bool:
    """Return True if ``email`` as a whole looks like ``local@domain.tld``.

    The check is purely syntactic: an ASCII local part, an ASCII domain and
    a top-level domain of at least two letters.
    """
    # fullmatch() instead of match() with "$": "$" also matches just before
    # a trailing newline, so "user@example.com\n" used to be accepted.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.fullmatch(pattern, email) is not None

# Extract all URLs
def extract_urls(text: str) -> list[str]:
    """Collect every http/https URL found in ``text``, in order of appearance.

    A URL runs from its scheme up to the first whitespace character or one
    of the delimiters excluded by the character class.
    """
    url_regex = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
    return [hit.group() for hit in url_regex.finditer(text)]

# Clean and normalize whitespace
def normalize_whitespace(text: str) -> str:
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.compile(r'\s+').sub(' ', text)
    return collapsed.strip()

print(normalize_whitespace("  too   many   spaces  "))
# too many spaces

# Parse log entries
# Layout expected by the pattern: "<date> <time> [<LEVEL>] <message>";
# date and time are matched loosely as runs of non-whitespace characters
log = "2026-03-30 14:30:45 [ERROR] Connection timeout for host 10.0.0.1"
pattern = r'(?P<date>\S+) (?P<time>\S+) \[(?P<level>\w+)\] (?P<message>.+)'
match = re.match(pattern, log)
if match:
    print(match.groupdict())
    # {'date': '2026-03-30', 'time': '14:30:45',
    #  'level': 'ERROR', 'message': 'Connection timeout for host 10.0.0.1'}

# Convert camelCase to snake_case
def camel_to_snake(name: str) -> str:
    """Insert "_" before every uppercase ASCII letter except a leading one, then lowercase.

    Every capital starts a new segment, so acronyms are split letter by letter.
    """
    # Zero-width match: the position before a capital that is not the string start.
    boundary = re.compile(r'(?<!^)(?=[A-Z])')
    return boundary.sub('_', name).lower()

print(camel_to_snake("camelCaseVariable"))  # camel_case_variable
print(camel_to_snake("HTMLParser"))         # h_t_m_l_parser

Итоги

  • re.search() -- первое совпадение в любом месте строки
  • re.match() -- совпадение только в начале строки
  • re.findall() -- все совпадения, re.finditer() -- итератор Match-объектов
  • re.sub() -- замена с поддержкой групп и функций
  • Группы: (...) захват, (?:...) без захвата, (?P<name>...) именованные
  • Lookahead/lookbehind -- утверждения нулевой ширины для контекстного поиска
  • Всегда используйте raw strings (r'...') для регулярных выражений
  • re.compile() -- компиляция для повторного использования
  • re.VERBOSE -- читаемые регулярные выражения с комментариями

Проверь себя

🧪

Чем re.match() отличается от re.search()?

🧪

Что означает квантификатор *? (со знаком вопроса) в регулярных выражениях?

🧪

Что означает (?<=\$)\d+ в регулярном выражении?

🧪

Какой флаг нужен, чтобы точка (.) совпадала с символом новой строки?