Регулярные выражения
Регулярные выражения (regex) -- мощный инструмент для поиска, проверки и трансформации текста по шаблонам. В Python регулярные выражения доступны через модуль re.
Основы модуля re
import re
text = "Мой телефон: +7 (999) 123-45-67, рабочий: +7 (495) 987-65-43"

# re.search() scans the whole string and returns the first match (or None).
match = re.search(r'\d{3}-\d{2}-\d{2}', text)
if match:
    print(match.group())  # 123-45-67
    print(match.start())  # 22 (index of the first matched character)
    print(match.end())    # 31 (index one past the last matched character)
    print(match.span())   # (22, 31)

# re.match() only matches at the beginning of the string.
print(re.match(r'\d+', "123abc"))  # <re.Match object; span=(0, 3), match='123'>
print(re.match(r'\d+', "abc123"))  # None (no match at start)

# re.fullmatch() requires the pattern to cover the entire string.
print(re.fullmatch(r'\d+', "123"))     # <re.Match object; span=(0, 3), match='123'>
print(re.fullmatch(r'\d+', "123abc"))  # None
re.findall() и re.finditer()
import re
text = "Цены: 100 руб, 250 руб, 1500 руб, 42.50 руб"

# findall() returns a list of all matches.
# The fractional part is an optional non-capturing group, so a dot is only
# consumed when digits follow it (r'\d+\.?\d*' would also swallow the
# sentence-ending dot in "... 100.").
prices = re.findall(r'\d+(?:\.\d+)?', text)
print(prices)  # ['100', '250', '1500', '42.50']

# finditer() returns an iterator of Match objects (gives access to positions).
for match in re.finditer(r'\d+(?:\.\d+)?', text):
    print(f"Цена: {match.group()} на позиции {match.span()}")

# findall() with capturing groups returns tuples of the captured groups only.
text = "Дата: 2026-03-30, обновлено: 2026-04-01"
dates = re.findall(r'(\d{4})-(\d{2})-(\d{2})', text)
print(dates)  # [('2026', '03', '30'), ('2026', '04', '01')]
Синтаксис регулярных выражений
Символьные классы
import re
# . (dot) - any character except newline
print(re.findall(r'c.t', 'cat cot cut c\nt'))  # ['cat', 'cot', 'cut']

# \d - decimal digit, \D - non-digit
# (for str patterns \d also matches non-ASCII digits unless re.ASCII is set)
print(re.findall(r'\d+', 'abc123def456'))  # ['123', '456']

# \w - word character (letters, digits, underscore), \W - non-word
# (Unicode-aware for str patterns; [a-zA-Z0-9_] only with re.ASCII)
print(re.findall(r'\w+', 'hello, world! 42'))  # ['hello', 'world', '42']

# \s - whitespace, \S - non-whitespace
print(re.findall(r'\S+', 'hello world'))  # ['hello', 'world']

# Custom character classes
print(re.findall(r'[aeiou]', 'hello world'))  # ['e', 'o', 'o']
print(re.findall(r'[^aeiou\s]', 'hello'))     # ['h', 'l', 'l'] (negation)

# Hex literals: the "0x" prefix must be matched explicitly, because "x" is
# not a hex digit and a bare [a-fA-F0-9]+ would split '0xFF' into '0' and 'FF'.
hex_literals = re.findall(r'0[xX][a-fA-F0-9]+', '0xFF 0x1A')
print(hex_literals)  # ['0xFF', '0x1A']

# Russian letters (ё/Ё lie outside the а-я range and are listed explicitly)
print(re.findall(r'[а-яА-ЯёЁ]+', 'Привет мир hello'))
# ['Привет', 'мир']
Квантификаторы
import re
# Basic quantifiers: * = zero or more, + = one or more, ? = zero or one
print(re.compile(r"ab*c").findall("ac abc abbc"))  # ['ac', 'abc', 'abbc']
print(re.compile(r"ab+c").findall("ac abc abbc"))  # ['abc', 'abbc']
print(re.compile(r"ab?c").findall("ac abc abbc"))  # ['ac', 'abc']

# Counted repetition: {n} = exactly n times, {n,m} = between n and m times
print([m.group() for m in re.finditer(r"\d{3}", "1 12 123 1234")])    # ['123', '123']
print([m.group() for m in re.finditer(r"\d{2,4}", "1 12 123 1234")])  # ['12', '123', '1234']

# Greedy quantifiers grab as much as possible; a trailing ? makes them lazy.
html = "<b>bold</b> and <i>italic</i>"
print(re.compile(r"<.*>").findall(html))   # ['<b>bold</b> and <i>italic</i>'] (greedy)
print(re.compile(r"<.*?>").findall(html))  # ['<b>', '</b>', '<i>', '</i>'] (lazy)
print(re.compile(r"<.+?>").findall(html))  # ['<b>', '</b>', '<i>', '</i>']
Якоря и границы
import re
# ^ start of string, $ end of string
print(re.findall(r'^\w+', 'hello world')) # ['hello']
print(re.findall(r'\w+$', 'hello world')) # ['world']
# \b word boundary
text = "cat category concatenate"
print(re.findall(r'\bcat\b', text)) # ['cat'] (exact word)
print(re.findall(r'\bcat\w*', text)) # ['cat', 'category'] (no word boundary before "cat" inside "concatenate")
# Multiline mode: ^ also matches right after every newline
multiline = "first\nsecond\nthird"
print(re.findall(r'^\w+', multiline, re.MULTILINE))
# ['first', 'second', 'third']
Группы
import re
# Numbered groups: group(0) is the whole match, group(n) the n-th parenthesis.
match = re.search(r'(\d{4})-(\d{2})-(\d{2})', '2026-03-30')
if match:
    print(match.group(0))  # '2026-03-30' (full match)
    print(match.group(1))  # '2026' (first group)
    print(match.group(2))  # '03'
    print(match.group(3))  # '30'
    print(match.groups())  # ('2026', '03', '30')

# Named groups (?P<name>...) can be read by name instead of by position.
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.search(pattern, '2026-03-30')
if match:
    print(match.group('year'))   # '2026'
    print(match.group('month'))  # '03'
    print(match.groupdict())     # {'year': '2026', 'month': '03', 'day': '30'}
# Non-capturing group (?:...)
# Useful when you need grouping but don't need to capture
# NOTE(review): the sample addresses were lost to e-mail obfuscation in the
# source text; these are placeholder values that the pattern matches.
emails = "user@gmail.com admin@yahoo.com"
pattern = r'\w+@(?:gmail|yahoo)\.com'
print(re.findall(pattern, emails))  # ['user@gmail.com', 'admin@yahoo.com']
# Note: findall returns full match when no capturing groups
# Backreferences: \1 must repeat exactly what group 1 captured.
# Used here to spot a word that is immediately repeated.
text = "the the cat sat on on the mat"
pattern = r"\b(\w+)\s+\1\b"
print([m.group(1) for m in re.finditer(pattern, text)])  # ['the', 'on']

# The same check written with a named group and a (?P=name) backreference.
pattern = r"\b(?P<word>\w+)\s+(?P=word)\b"
print([m.group("word") for m in re.finditer(pattern, text)])  # ['the', 'on']
Альтернация (|)
import re
# Alternation: the match may be either side of the | operator.
text = "cat dog bird cat fish"
print([m.group() for m in re.finditer(r"cat|dog", text)])  # ['cat', 'dog', 'cat']

# A character class and a grouped alternation are interchangeable here.
text = "gray grey"
print(re.compile(r"gr[ae]y").findall(text))      # ['gray', 'grey']
print(re.compile(r"gr(?:a|e)y").findall(text))   # ['gray', 'grey'] (same)

# Alternation between whole sub-patterns (URL schemes).
urls = "Visit https://python.org or ftp://files.example.com"
pattern = r"(?:https?|ftp)://\S+"
print(re.findall(pattern, urls))
# ['https://python.org', 'ftp://files.example.com']
Lookahead и Lookbehind
Утверждения нулевой ширины -- проверяют контекст, не включая его в совпадение:
import re
# Positive lookahead (?=...)
# Find word followed by a number
text = "item1 item2 thing item3 stuff"
print(re.findall(r'\w+(?=\d)', text))  # ['item', 'item', 'item']

# Negative lookahead (?!...)
# Find "item" NOT followed by a digit
text = "item1 item item3 items"
print(re.findall(r'item(?!\d)', text))  # ['item', 'item']

# Positive lookbehind (?<=...)
# Find numbers preceded by $
text = "Price: $100, Cost: €200, Total: $350"
print(re.findall(r'(?<=\$)\d+', text))  # ['100', '350']

# Negative lookbehind (?<!...)
# Find numbers NOT preceded by $.
# The lookbehind must also reject a preceding digit: with a plain (?<!\$)
# the engine simply starts one character later and returns the tails
# '00' and '50' of the dollar amounts.
non_dollar_amounts = re.findall(r'(?<![$\d])\d+', text)
print(non_dollar_amounts)  # ['200']
# Practical: password validation
def validate_password(password: str) -> bool:
    """Check that a password is strong enough.

    A valid password is at least 8 characters long and contains at least
    one lowercase ASCII letter, one uppercase ASCII letter and one digit.

    Args:
        password: Candidate password.

    Returns:
        True if every requirement is met, False otherwise.
    """
    # Each lookahead asserts one requirement without consuming input.
    # fullmatch() is used instead of match() + '$' because '$' also matches
    # just before a trailing newline, which would accept "Abc12345\n".
    pattern = r'(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}'
    return re.fullmatch(pattern, password) is not None


print(validate_password("Abc12345"))  # True
print(validate_password("abc12345"))  # False (no uppercase)
print(validate_password("Short1"))    # False (< 8 chars)
re.sub() и замена
import re
# Reorder DD-MM-YYYY into YYYY-MM-DD via numbered group references.
text = "Дата: 30-03-2026"
result = re.compile(r"(\d{2})-(\d{2})-(\d{4})").sub(r"\3-\2-\1", text)
print(result)  # Дата: 2026-03-30

# The same rewrite with named groups, referenced as \g<name> in the template.
result = re.compile(
    r"(?P<day>\d{2})-(?P<month>\d{2})-(?P<year>\d{4})"
).sub(r"\g<year>/\g<month>/\g<day>", text)
print(result)  # Дата: 2026/03/30
# Replacement with a function
def censor_email(match: re.Match[str]) -> str:
    """Build a censored replacement for one matched e-mail address.

    Args:
        match: Match object whose group 1 is the local part and whose
            group 2 is the domain.

    Returns:
        The first character of the local part, then ``***@`` and the domain.
    """
    local, domain = match.group(1), match.group(2)
    return f"{local[0]}***@{domain}"


# NOTE(review): the sample addresses were lost to e-mail obfuscation in the
# source text; these placeholders reproduce the documented output below.
text = "Контакты: ivan@mail.ru и maria@gmail.com"
result = re.sub(r'(\w+)@(\w+\.\w+)', censor_email, text)
print(result)  # Контакты: i***@mail.ru и m***@gmail.com
# subn() works like sub() but hands back (new_string, replacements_made).
text = "aaa bbb aaa"
result, count = re.compile(r"aaa").subn("xxx", text)
print(f"{result} ({count} замен)")  # xxx bbb xxx (2 замен)

# count= caps how many matches are replaced (leftmost first).
print(re.compile(r"\d").sub("X", "a1b2c3", count=2))  # aXbXc3
Компиляция и флаги
import re
# Compile a pattern once when it is reused; the compiled object exposes the
# same search/findall/sub methods as the re module functions.
phone_pattern = re.compile(r'\+?\d[\d\s\-()]{7,}\d')
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

# NOTE(review): the sample address was lost to e-mail obfuscation in the
# source text; this is a placeholder value that email_pattern matches.
text = "Звоните: +7 (999) 123-45-67 или пишите: info@example.com"
print(phone_pattern.findall(text))  # ['+7 (999) 123-45-67']
print(email_pattern.findall(text))  # ['info@example.com']
# Flags
# re.IGNORECASE (short form re.I): letters match regardless of case
print(re.compile(r"python", re.IGNORECASE).findall("Python PYTHON python"))
# ['Python', 'PYTHON', 'python']

# re.MULTILINE (short form re.M): ^ and $ also match at line boundaries
text = "first line\nsecond line\nthird line"
print(re.compile(r"^\w+", re.MULTILINE).findall(text))  # ['first', 'second', 'third']

# re.DOTALL (short form re.S): the dot matches newline characters as well
text = "<div>\nContent\n</div>"
print(re.compile(r"<div>.*?</div>", re.DOTALL).findall(text))
# ['<div>\nContent\n</div>']
# re.VERBOSE (re.X) - whitespace in the pattern is ignored and "#" starts a
# comment, so a long pattern can be laid out and annotated line by line.
pattern = re.compile(r"""
(?P<protocol>https?://) # Protocol (http or https)
(?P<domain>[^/\s]+) # Domain name
(?P<path>/\S*)? # Optional path
""", re.VERBOSE)
match = pattern.search("Visit https://python.org/docs for details")
if match:
    print(match.groupdict())
    # {'protocol': 'https://', 'domain': 'python.org', 'path': '/docs'}

# Combine flags with bitwise OR
pattern = re.compile(r'hello', re.I | re.M)
Практические примеры
import re
# Validate email
def is_valid_email(email: str) -> bool:
    """Return whether *email* has the shape ``local@domain.tld``.

    This is a pragmatic syntax check, not full RFC 5322 validation: the
    local part may contain ASCII letters, digits and ``._%+-``; the domain
    may contain ASCII letters, digits, dots and hyphens and must end in a
    dot followed by at least two ASCII letters.

    Args:
        email: Candidate address.

    Returns:
        True if the whole string matches the pattern, False otherwise.
    """
    # fullmatch() replaces the '^...$' anchors: '$' also matches just before
    # a trailing newline, so "user@example.com\n" used to be accepted.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.fullmatch(pattern, email) is not None
# Extract all URLs
def extract_urls(text: str) -> list[str]:
    """Return every http/https URL found in *text*, in order of appearance.

    A URL starts with ``http://`` or ``https://`` and runs until the first
    whitespace character or one of ``< > " { } | \\ ^ ` [ ]``.

    Args:
        text: Text to scan.

    Returns:
        List of matched URL strings; empty if none are found.
    """
    pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
    return re.findall(pattern, text)
# Clean and normalize whitespace
def normalize_whitespace(text: str) -> str:
    """Collapse whitespace runs into single spaces and trim both ends.

    Args:
        text: Input string.

    Returns:
        *text* with every run of whitespace characters (as matched by
        ``\\s``) replaced by one space and leading/trailing whitespace removed.
    """
    return re.sub(r'\s+', ' ', text).strip()


print(normalize_whitespace(" too many spaces "))
# 'too many spaces'
# Parse log entries of the form "<date> <time> [<LEVEL>] <message>"
log = "2026-03-30 14:30:45 [ERROR] Connection timeout for host 10.0.0.1"
pattern = r'(?P<date>\S+) (?P<time>\S+) \[(?P<level>\w+)\] (?P<message>.+)'
match = re.match(pattern, log)
if match:
    print(match.groupdict())
    # {'date': '2026-03-30', 'time': '14:30:45',
    # 'level': 'ERROR', 'message': 'Connection timeout for host 10.0.0.1'}
# Convert camelCase to snake_case
def camel_to_snake(name: str) -> str:
    """Convert a camelCase or PascalCase identifier to snake_case.

    An underscore is inserted before every uppercase ASCII letter except
    one at the very start, then the result is lowercased. Consecutive
    capitals are therefore split letter by letter ("HTMLParser" becomes
    "h_t_m_l_parser").

    Args:
        name: Identifier to convert.

    Returns:
        The lowercased, underscore-separated form of *name*.
    """
    # Zero-width match: any position that is not the start of the string
    # and is directly followed by an uppercase letter.
    s = re.sub(r'(?<!^)(?=[A-Z])', '_', name)
    return s.lower()


print(camel_to_snake("camelCaseVariable"))  # camel_case_variable
print(camel_to_snake("HTMLParser"))         # h_t_m_l_parser
Итоги
- re.search() -- первое совпадение в любом месте строки
- re.match() -- совпадение только в начале строки
- re.findall() -- все совпадения, re.finditer() -- итератор Match-объектов
- re.sub() -- замена с поддержкой групп и функций
- Группы: (...) захват, (?:...) без захвата, (?P<name>...) именованные
- Lookahead/lookbehind -- утверждения нулевой ширины для контекстного поиска
- Всегда используйте raw strings (r'...') для регулярных выражений
- re.compile() -- компиляция для повторного использования
- re.VERBOSE -- читаемые регулярные выражения с комментариями