正则表达式是处理字符串的强力工具,它使用特定的语法规则来描述字符串的模式,可以用于验证、搜索、替换和提取字符串内容。
正则表达式基础语法
1. 基本匹配规则
import re
def basic_patterns():
    """Print the results of four elementary regex patterns.

    Demonstrates a literal match, a fixed-width digit run, a run of word
    characters, and the single-character wildcard. Takes no arguments and
    returns ``None``; all results are written to stdout.
    """
    # Literal match: search() returns the first occurrence anywhere in the text.
    pattern1 = r'hello'
    text1 = 'hello world'
    match1 = re.search(pattern1, text1)
    print(f"精确匹配 '{pattern1}' 在 '{text1}': {match1.group() if match1 else '不匹配'}")

    # Digit match: exactly three consecutive digits.
    pattern2 = r'\d{3}'
    text2 = '我的电话是123,你的电话是456'
    matches2 = re.findall(pattern2, text2)
    print(f"数字匹配 '{pattern2}' 在 '{text2}': {matches2}")

    # Word characters: for str patterns \w covers letters, digits, the
    # underscore and other Unicode word characters (so CJK text matches too),
    # while '-' and spaces act as separators.
    pattern3 = r'\w+'
    text3 = 'user_name123 和 test-var'
    matches3 = re.findall(pattern3, text3)
    print(f"字母数字匹配 '{pattern3}' 在 '{text3}': {matches3}")

    # Wildcard: 'py' followed by any single character except a newline.
    pattern4 = r'py.'
    text4 = 'pyc, pyo, py!, python'
    matches4 = re.findall(pattern4, text4)
    print(f"任意字符匹配 '{pattern4}' 在 '{text4}': {matches4}")
def quantifiers():
    """Show how each regex quantifier behaves on a fixed set of samples.

    For every (pattern, description) pair, ``re.findall`` is run against
    each sample string and the samples that yield a non-empty result list
    are printed. Returns ``None``.
    """
    # NOTE(review): the two 'a b c' entries are identical as written; the
    # second one may originally have contained wider gaps -- confirm.
    texts = [
        'a', 'ab', 'abc', 'abcd',
        'a b c', 'a b c',
        '100', '1000', '10000'
    ]
    patterns = [
        (r'a?', '0或1个a'),
        (r'a*', '0或多个a'),
        (r'a+', '1或多个a'),
        (r'a{2}', '2个a'),
        (r'a{2,3}', '2-3个a'),
        (r'\s+', '1或多个空格'),
        (r'\d{3,5}', '3-5个数字')
    ]
    for pattern, description in patterns:
        print(f"\n模式 '{pattern}' ({description}):")
        for text in texts:
            matches = re.findall(pattern, text)
            # Patterns that can match the empty string (a?, a*) return a
            # list of '' entries, which is still truthy and gets printed.
            if matches:
                print(f" '{text}' -> {matches}")
if __name__ == "__main__":
    # Run the demos only when executed as a script, not on import.
    basic_patterns()
    quantifiers()
2. 字符集和范围
import re
def character_sets():
    """Print what several character-class patterns match in sample strings.

    Each (pattern, description) pair is applied with ``re.findall`` to every
    sample; samples with at least one match are printed. Returns ``None``.
    """
    texts = [
        'abc', '123', 'a1b2', 'A_B',
        'hello-world', 'test@example.com'
    ]
    patterns = [
        (r'[abc]', '匹配a,b,c任意字符'),
        (r'[0-9]', '匹配数字'),
        (r'[a-z]', '匹配小写字母'),
        (r'[A-Za-z]', '匹配所有字母'),
        (r'[0-9a-zA-Z_]', '匹配字母数字下划线'),
        (r'[^0-9]', '匹配非数字字符'),
        (r'[\w\-]', '匹配字母数字下划线和横线')
    ]
    for pattern, description in patterns:
        print(f"\n模式 '{pattern}' ({description}):")
        for text in texts:
            # A character class matches one character at a time, so findall
            # returns a list of single-character strings.
            matches = re.findall(pattern, text)
            if matches:
                print(f" '{text}' -> {matches}")
def word_and_boundary():
    r"""Print match positions for word-boundary and anchor patterns.

    Applies ``\b``-delimited words plus ``^`` / ``$`` anchored patterns to a
    set of samples with ``re.search`` and prints the span of every hit.
    Returns ``None``.
    """
    texts = [
        'hello world', 'helloworld',
        'the theme', 'them',
        'cat category catfish'
    ]
    patterns = [
        (r'\bhello\b', '完整单词hello'),
        (r'\bthe\b', '完整单词the'),
        (r'\bcat\b', '完整单词cat'),
        (r'^hello', '行首的hello'),
        (r'world$', '行尾的world')
    ]
    for pattern, description in patterns:
        print(f"\n模式 '{pattern}' ({description}):")
        for text in texts:
            # search() stops at the first hit, so only one span is reported
            # per sample even if the word occurs more than once.
            match = re.search(pattern, text)
            if match:
                print(f" '{text}' -> 匹配位置: {match.span()}")
def alternation():
    """Print the text matched by patterns that use ``|`` and optional groups.

    Each pattern is tried against every sample with ``re.search``; the
    matched substring is printed for every hit. Returns ``None``.
    """
    texts = [
        'python', 'Python', 'PYTHON',
        'java', 'javascript', 'javac'
    ]
    patterns = [
        (r'(P|p)ython', 'Python或python'),
        (r'java(script)?', 'java或javascript'),
        (r'(java|python)', 'java或python')
    ]
    for pattern, description in patterns:
        print(f"\n模式 '{pattern}' ({description}):")
        for text in texts:
            match = re.search(pattern, text)
            if match:
                # group() is the whole match, not the captured alternative.
                print(f" '{text}' -> 匹配: {match.group()}")
if __name__ == "__main__":
    # Run the demos only when executed as a script, not on import.
    character_sets()
    word_and_boundary()
    alternation()
re 模块核心功能
1. 匹配和搜索
import re
def match_vs_search():
    """Contrast ``re.match`` (anchored at the start) with ``re.search``.

    The same four literal patterns are run through both functions against
    one sentence, and the outcome of each attempt is printed.
    Returns ``None``.
    """
    text = 'hello world, welcome to python programming'
    patterns = [
        r'hello',
        r'world',
        r'python',
        r'programming'
    ]

    print("=== match() 方法 ===")
    for pattern in patterns:
        # match() only succeeds if the pattern matches at position 0.
        match_obj = re.match(pattern, text)
        if match_obj:
            print(f"'{pattern}' 匹配: {match_obj.group()} (位置: {match_obj.span()})")
        else:
            print(f"'{pattern}' 不匹配")

    print("\n=== search() 方法 ===")
    for pattern in patterns:
        # search() scans the whole string for the first match.
        match_obj = re.search(pattern, text)
        if match_obj:
            print(f"'{pattern}' 找到: {match_obj.group()} (位置: {match_obj.span()})")
def findall_finditer():
    """Compare ``re.findall`` (list of strings) with ``re.finditer`` (match objects).

    Extracts ``ddd-dddd`` phone numbers from a sample sentence, first as a
    plain list and then one match object at a time so positions can be
    shown. Returns ``None``.
    """
    text = '我的电话是123-4567,工作电话是890-1234,紧急联系是555-6789'
    phone_pattern = r'\d{3}-\d{4}'

    print("=== findall() 返回匹配字符串列表 ===")
    phones = re.findall(phone_pattern, text)
    print(f"找到的电话号码: {phones}")

    print("\n=== finditer() 返回匹配对象迭代器 ===")
    phone_matches = re.finditer(phone_pattern, text)
    # Number the hits from 1 for display purposes.
    for i, match in enumerate(phone_matches, 1):
        print(f"电话 {i}: {match.group()} (位置: {match.span()})")
def fullmatch_demo():
    """Show that ``re.fullmatch`` succeeds only when the whole string matches.

    Runs four (text, pattern) pairs and prints whether each text is matched
    in its entirety. Returns ``None``.
    """
    test_cases = [
        ('123-4567', r'\d{3}-\d{4}'),
        ('123-456', r'\d{3}-\d{4}'),
        ('abc123', r'[a-z]+\d{3}'),
        ('abc12', r'[a-z]+\d{3}')
    ]
    print("=== fullmatch() 完全匹配 ===")
    for text, pattern in test_cases:
        match_obj = re.fullmatch(pattern, text)
        if match_obj:
            print(f"'{text}' 完全匹配 '{pattern}': {match_obj.group()}")
        else:
            print(f"'{text}' 不完全匹配 '{pattern}'")
if __name__ == "__main__":
    # Run the demos only when executed as a script, not on import.
    match_vs_search()
    findall_finditer()
    fullmatch_demo()
2. 字符串分割和替换
import re
def split_demo():
    """Print the result of ``re.split`` for several delimiter patterns.

    Each test case is a (text, pattern, description) triple; the split
    result is printed next to the original text. Returns ``None``.
    """
    test_cases = [
        ('a b c', r'\s+', '分割空格'),
        ('a,b, c d', r'[\s,]+', '分割空格和逗号'),
        ('a,b;; c d', r'[\s,;]+', '分割空格、逗号和分号'),
        ('2023-08-15', r'-', '按横线分割日期'),
        ('name=John&age=30&city=NY', r'[&=]', '分割URL参数')
    ]
    print("=== 正则表达式分割字符串 ===")
    for text, pattern, description in test_cases:
        # A '+' after the class collapses runs of delimiters, so no empty
        # strings appear between adjacent separators.
        result = re.split(pattern, text)
        print(f"{description}: '{text}' -> {result}")
def sub_demo():
    """Print the effect of several ``re.sub`` replacements on one sentence.

    Every replacement is applied to the original text independently (the
    results are not chained), masking dates, times, phone numbers and
    finally every digit run. Returns ``None``.
    """
    text = '今天是2023-08-15,会议在10:30开始,联系电话:123-456-7890'
    replacements = [
        (r'\d{4}-\d{2}-\d{2}', 'XXXX-XX-XX', '替换日期'),
        (r'\d{2}:\d{2}', 'XX:XX', '替换时间'),
        (r'\d{3}-\d{3}-\d{4}', 'XXX-XXX-XXXX', '替换电话号码'),
        (r'\d+', '#', '替换所有数字')
    ]
    print("=== 正则表达式替换 ===")
    print(f"原始文本: {text}\n")
    for pattern, repl, description in replacements:
        result = re.sub(pattern, repl, text)
        print(f"{description}: {result}")
def sub_with_callback():
    """Replace dollar prices with yuan prices using a ``re.sub`` callback.

    Prints the original sentence and the converted one. Returns ``None``.
    """
    text = '价格分别是 $10.50, $20.00, 和 $15.75'

    def convert_price(match):
        """Return the matched USD amount converted to a yuan string."""
        price_usd = float(match.group(1))
        # Fixed demo exchange rate of 1 USD = 7.2 RMB.
        price_rmb = price_usd * 7.2
        return f'¥{price_rmb:.2f}'

    # '$' is escaped because it is otherwise the end-of-string anchor; the
    # capture group holds the numeric part with exactly two decimals.
    pattern = r'\$(\d+\.\d{2})'
    result = re.sub(pattern, convert_price, text)
    print(f"原始: {text}")
    print(f"转换: {result}")
def subn_demo():
    """Show that ``re.subn`` returns both the new string and the hit count.

    Prints the original text, the replaced text and the number of
    substitutions made. Returns ``None``.
    """
    text = '苹果 香蕉 苹果 橙子 苹果 葡萄'
    # subn() returns a (new_string, number_of_substitutions) tuple.
    result, count = re.subn(r'苹果', '水果', text)
    print(f"原始文本: {text}")
    print(f"替换结果: {result}")
    print(f"替换次数: {count}")
if __name__ == "__main__":
    # Run the demos only when executed as a script, separated by rulers.
    split_demo()
    print("\n" + "="*50 + "\n")
    sub_demo()
    print("\n" + "="*50 + "\n")
    sub_with_callback()
    print("\n" + "="*50 + "\n")
    subn_demo()
3. 分组提取
import re
def basic_groups():
    """Extract an area code and a number with two positional capture groups.

    Prints the whole match, each group, the groups tuple and the (empty)
    named-group dictionary. Returns ``None``.
    """
    text = '联系电话:010-12345678'
    # Group 1 is the three-digit area code, group 2 the eight-digit number.
    pattern = r'(\d{3})-(\d{8})'
    match = re.search(pattern, text)
    if match:
        print(f"完整匹配: {match.group(0)}")
        print(f"区号: {match.group(1)}")
        print(f"号码: {match.group(2)}")
        print(f"所有分组: {match.groups()}")
        # No named groups are defined, so groupdict() is an empty dict.
        print(f"分组字典: {match.groupdict()}")
def named_groups():
    """Extract name, age and address with ``(?P<name>...)`` named groups.

    Prints each named group and the resulting ``groupdict()``.
    Returns ``None``.
    """
    text = '张三,30岁,住在北京市朝阳区'
    # Named groups can be read by name instead of by position.
    pattern = r'(?P<name>\w+),(?P<age>\d+)岁,住在(?P<address>.+)'
    match = re.search(pattern, text)
    if match:
        print("=== 命名分组 ===")
        print(f"完整匹配: {match.group(0)}")
        print(f"姓名: {match.group('name')}")
        print(f"年龄: {match.group('age')}")
        print(f"地址: {match.group('address')}")
        print(f"分组字典: {match.groupdict()}")
def email_parsing():
    """Split sample e-mail addresses into user, domain and suffix groups.

    Addresses that match the pattern are printed with their three parts;
    the rest are reported as invalid. Returns ``None``.
    """
    emails = [
        'user@example.com',
        'john.doe@company.co.uk',
        'invalid-email',
        'name@domain'
    ]
    # Group 2 is greedy, so for multi-label domains the suffix (group 3) is
    # only the last label, e.g. 'company.co' + 'uk'.
    pattern = r'^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$'
    for email in emails:
        match = re.match(pattern, email)
        if match:
            print(f"有效邮箱: {email}")
            print(f" 用户名: {match.group(1)}")
            print(f" 域名: {match.group(2)}")
            print(f" 后缀: {match.group(3)}")
        else:
            print(f"无效邮箱: {email}")
def time_parsing():
    """Validate ``H:M:S`` strings and print their hour/minute/second parts.

    Hours are limited to 0-23 and minutes/seconds to 0-59; leading zeros
    are optional. Returns ``None``.
    """
    times = [
        '19:05:30',
        '9:5:3',
        '23:59:59',
        '24:00:00'  # out of range: hours stop at 23
    ]
    # Hours: 0-19 with an optional leading digit, or 20-23.
    pattern = r'^([01]?[0-9]|2[0-3]):([0-5]?[0-9]):([0-5]?[0-9])$'
    for time_str in times:
        match = re.match(pattern, time_str)
        if match:
            hours, minutes, seconds = match.groups()
            print(f"有效时间: {time_str}")
            print(f" 时: {hours}, 分: {minutes}, 秒: {seconds}")
        else:
            print(f"无效时间: {time_str}")
def nested_groups():
    r"""Parse a simple HTML element into tag name, attributes and content.

    The pattern nests a capture group inside an optional non-capturing
    group and uses the back-reference ``\1`` so the closing tag must repeat
    the opening tag name. Returns ``None``.
    """
    text = '<div class="header">标题</div>'
    # (\w+)            -> tag name (group 1)
    # (?:\s+([^>]*))?  -> optional attribute text (group 2)
    # ([^<]*)          -> element content (group 3)
    pattern = r'<(\w+)(?:\s+([^>]*))?>([^<]*)</\1>'
    match = re.search(pattern, text)
    if match:
        print("=== HTML标签解析 ===")
        print(f"完整匹配: {match.group(0)}")
        print(f"标签名: {match.group(1)}")
        print(f"属性: {match.group(2)}")
        print(f"内容: {match.group(3)}")
        print(f"所有分组: {match.groups()}")
if __name__ == "__main__":
    # Run the demos only when executed as a script, separated by rulers.
    basic_groups()
    print("\n" + "="*50 + "\n")
    named_groups()
    print("\n" + "="*50 + "\n")
    email_parsing()
    print("\n" + "="*50 + "\n")
    time_parsing()
    print("\n" + "="*50 + "\n")
    nested_groups()
4. 贪婪 vs 非贪婪匹配
import re
def greedy_vs_lazy():
    """Contrast greedy ``.*`` with lazy ``.*?`` on two adjacent ``<div>`` blocks.

    Prints the single span the greedy pattern swallows and the two separate
    elements the lazy pattern finds. Returns ``None``.
    """
    text = '<div>内容1</div><div>内容2</div>'
    print("原始文本:", text)

    # Greedy: '.*' extends to the last '</div>', covering both elements.
    greedy_pattern = r'<div>.*</div>'
    greedy_match = re.search(greedy_pattern, text)
    print(f"\n贪婪匹配 '{greedy_pattern}':")
    print(f"匹配结果: {greedy_match.group() if greedy_match else '无匹配'}")

    # Lazy: '.*?' stops at the first '</div>', so each element is separate.
    lazy_pattern = r'<div>.*?</div>'
    lazy_matches = re.findall(lazy_pattern, text)
    print(f"\n非贪婪匹配 '{lazy_pattern}':")
    print(f"匹配结果: {lazy_matches}")
def number_example():
    r"""Show how greediness decides which group receives trailing zeros.

    With a greedy ``\d+`` the first group consumes the whole number and the
    second group is empty; with the lazy ``\d+?`` the trailing zeros end up
    in the second group. Returns ``None``.
    """
    text = '102300'
    print(f"原始数字: {text}")

    # Greedy: (\d+) takes everything, leaving (0*) with the empty string.
    greedy_pattern = r'^(\d+)(0*)$'
    greedy_match = re.match(greedy_pattern, text)
    if greedy_match:
        print(f"\n贪婪匹配 '{greedy_pattern}':")
        print(f"分组1: {greedy_match.group(1)}")
        print(f"分组2: {greedy_match.group(2)}")

    # Lazy: (\d+?) takes as little as possible, so (0*) captures '00'.
    lazy_pattern = r'^(\d+?)(0*)$'
    lazy_match = re.match(lazy_pattern, text)
    if lazy_match:
        print(f"\n非贪婪匹配 '{lazy_pattern}':")
        print(f"分组1: {lazy_match.group(1)}")
        print(f"分组2: {lazy_match.group(2)}")
def html_extraction():
    """Extract ``<div>`` contents from sample markup with three patterns.

    Compares a greedy pattern, a lazy pattern and a pattern pinned to one
    specific ``class`` attribute, printing a truncated ``repr`` of what each
    captures. Returns ``None``.
    """
    # NOTE(review): the sample markup is reproduced without inner
    # indentation, exactly as it appears in the source.
    html_text = (
        '\n'
        '<div class="article">\n'
        '<h1>标题</h1>\n'
        '<p>第一段内容</p>\n'
        '<p>第二段内容</p>\n'
        '</div>\n'
        '<div class="comments">\n'
        '<p>评论1</p>\n'
        '<p>评论2</p>\n'
        '</div>\n'
    )
    print("=== HTML内容提取 ===")

    # Greedy quantifiers combined with DOTALL run across both <div> blocks,
    # so the capture group does not hold a single element's content.
    print("\n1. 贪婪匹配 (错误):")
    greedy_pattern = r'<div.*>(.*)</div>'
    greedy_matches = re.findall(greedy_pattern, html_text, re.DOTALL)
    for i, match in enumerate(greedy_matches, 1):
        print(f"匹配 {i}: {repr(match[:50])}...")

    # Lazy quantifiers stop at the nearest '>' and '</div>', yielding one
    # capture per <div>.
    print("\n2. 非贪婪匹配 (正确):")
    lazy_pattern = r'<div.*?>(.*?)</div>'
    lazy_matches = re.findall(lazy_pattern, html_text, re.DOTALL)
    for i, match in enumerate(lazy_matches, 1):
        print(f"匹配 {i}: {repr(match[:50])}...")

    # Pinning the class attribute selects only the article block.
    print("\n3. 精确匹配:")
    precise_pattern = r'<div class="article">(.*?)</div>'
    precise_match = re.search(precise_pattern, html_text, re.DOTALL)
    if precise_match:
        print(f"文章内容: {repr(precise_match.group(1)[:100])}...")
if __name__ == "__main__":
    # Run the demos only when executed as a script, separated by rulers.
    greedy_vs_lazy()
    print("\n" + "="*50 + "\n")
    number_example()
    print("\n" + "="*50 + "\n")
    html_extraction()
5. 预编译和性能优化
import re
import time
def compile_demo():
    """Validate sample e-mails and phone numbers with precompiled patterns.

    Each pattern is compiled once with ``re.compile`` and the resulting
    pattern object is reused for every sample. Returns ``None``.
    """
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    phone_pattern = r'^\d{3}-\d{3,8}$'

    # Compile once up front; the pattern objects expose match()/search()/etc.
    email_re = re.compile(email_pattern)
    phone_re = re.compile(phone_pattern)

    test_emails = [
        'user@example.com',
        'invalid.email',
        'name@domain.org',
        'test@sub.domain.com'
    ]
    test_phones = [
        '010-12345',
        '123-456789',
        'invalid-phone',
        '020-8086'
    ]

    print("=== 邮箱验证 ===")
    for email in test_emails:
        if email_re.match(email):
            print(f"有效邮箱: {email}")
        else:
            print(f"无效邮箱: {email}")

    print("\n=== 电话验证 ===")
    for phone in test_phones:
        if phone_re.match(phone):
            print(f"有效电话: {phone}")
        else:
            print(f"无效电话: {phone}")
def performance_comparison(iterations=1000, repeat=1000):
    """Time module-level ``re.findall`` against a precompiled pattern.

    Args:
        iterations: Number of ``findall`` calls timed for each variant.
        repeat: How many times the sample sentence is repeated to build the
            text being searched.

    Returns:
        None. The two elapsed times and their ratio are printed.
    """
    text = '这是一个测试文本,包含很多需要匹配的内容。' * repeat
    pattern = r'测试|匹配|内容'

    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the wall clock and can have coarse resolution.
    start_time = time.perf_counter()
    for _ in range(iterations):
        re.findall(pattern, text)
    direct_time = time.perf_counter() - start_time

    # The re module caches compiled patterns internally, so the gap measured
    # here mostly reflects the per-call cache lookup, not recompilation.
    compiled_re = re.compile(pattern)
    start_time = time.perf_counter()
    for _ in range(iterations):
        compiled_re.findall(text)
    compiled_time = time.perf_counter() - start_time

    print("=== 性能比较 ===")
    print(f"直接使用耗时: {direct_time:.4f}秒")
    print(f"预编译使用耗时: {compiled_time:.4f}秒")
    # Avoid ZeroDivisionError when the compiled run is too fast to measure.
    if compiled_time > 0:
        print(f"性能提升: {direct_time/compiled_time:.2f}倍")
    else:
        print("性能提升: 无法计算(预编译耗时为0)")
def flags_demo():
    """Print ``re.findall`` results for one text under different flag sets.

    Covers the default behaviour, ``IGNORECASE``, ``MULTILINE`` and
    ``DOTALL``, alone and combined. Returns ``None``.
    """
    text = (
        'Hello WORLD\n'
        'this is a MULTI-line\n'
        'TEXT example'
    )
    # Every entry is (pattern, flags, description); 0 means "no flags",
    # which is the default value of the flags argument.
    patterns = [
        (r'^hello', 0, '默认标志 - 不匹配'),
        (r'^hello', re.IGNORECASE, '忽略大小写 - 匹配'),
        (r'^hello', re.IGNORECASE | re.MULTILINE, '多行模式 - 匹配每行开头'),
        (r'.+', re.DOTALL, '点号匹配所有字符(包括换行)'),
        (r'^[a-z ]+$', re.IGNORECASE | re.MULTILINE, '多行+忽略大小写')
    ]
    print("原始文本:")
    print(text)
    print("\n=== 标志位演示 ===")
    for pattern, flags, description in patterns:
        matches = re.findall(pattern, text, flags=flags)
        print(f"\n{description}: '{pattern}'")
        print(f"匹配结果: {matches}")
if __name__ == "__main__":
    # Run the demos only when executed as a script, separated by rulers.
    compile_demo()
    print("\n" + "="*50 + "\n")
    performance_comparison()
    print("\n" + "="*50 + "\n")
    flags_demo()
实践练习
1. 邮箱验证
import re
def is_valid_email(addr):
    """Return True if *addr* looks like a well-formed e-mail address.

    Args:
        addr: The candidate address string.

    Returns:
        bool: True when the entire string has the form
        ``local@domain.tld`` using the allowed character sets, else False.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch() must consume the whole string. With match() and a '$'
    # anchor, an address followed by a trailing newline was still accepted,
    # because '$' also matches just before a final '\n'.
    return re.fullmatch(pattern, addr) is not None
def test_email_validation():
    """Print a validity verdict from ``is_valid_email`` for sample addresses.

    Returns ``None``; one aligned line is printed per address.
    """
    test_emails = [
        'someone@gmail.com',
        'bill.gates@microsoft.com',
        'bob#example.com',
        'mr-bob@example.com',
        'test.user+tag@sub.domain.co.uk',
        'invalid.email',
        'name@domain',
        '@domain.com',
        'user@.com'
    ]
    print("=== 邮箱验证测试 ===")
    for email in test_emails:
        is_valid = is_valid_email(email)
        status = "有效" if is_valid else "无效"
        # Pad the address to 30 columns so the verdicts line up.
        print(f"{email:30} -> {status}")
def name_of_email(addr):
    """Extract the display name or local part from an e-mail string.

    Two input shapes are recognised:

    * ``<Display Name> user@host.tld`` -> returns ``'Display Name'``
    * ``user@host.tld``                -> returns ``'user'``

    Args:
        addr: The string to parse.

    Returns:
        The extracted name, or ``None`` when *addr* matches neither shape.
    """
    # Shape 1: a name in angle brackets followed by the address.
    pattern1 = r'<([^>]+)>\s*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # Shape 2: a bare address; the local part is captured.
    pattern2 = r'([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # fullmatch() is used instead of match() + '$' so that a trailing
    # newline after the address is rejected rather than silently accepted.
    for pattern in (pattern1, pattern2):
        match = re.fullmatch(pattern, addr)
        if match:
            return match.group(1)
    return None
def test_name_extraction():
    """Print the name ``name_of_email`` extracts from each sample string.

    Returns ``None``; one aligned line is printed per sample.
    """
    test_cases = [
        '<Tom Paris> tom@voyager.org',
        'tom@voyager.org',
        '<John Doe> john.doe@example.com',
        'alice.smith@company.co.uk',
        'invalid-email'
    ]
    print("\n=== 邮箱名字提取测试 ===")
    for case in test_cases:
        name = name_of_email(case)
        # Pad the input to 35 columns so the results line up.
        print(f"{case:35} -> {name}")
if __name__ == "__main__":
    test_email_validation()
    test_name_extraction()

    # Self-checks for the exercise.
    print("\n=== 练习验证 ===")
    assert is_valid_email('someone@gmail.com')
    assert is_valid_email('bill.gates@microsoft.com')
    assert not is_valid_email('bob#example.com')
    # The local-part character class in is_valid_email includes '-', so this
    # address is valid. Asserting the opposite made the script abort with
    # AssertionError before reaching the remaining checks.
    assert is_valid_email('mr-bob@example.com')
    print('邮箱验证测试通过!')

    assert name_of_email('<Tom Paris> tom@voyager.org') == 'Tom Paris'
    assert name_of_email('tom@voyager.org') == 'tom'
    print('名字提取测试通过!')
2. 实用正则表达式示例
import re
def practical_examples():
    """Run five small, practical regex demos and print their results.

    The demos are: URL extraction, CJK text extraction, password strength
    checking, HTML tag stripping and Chinese ID-card format validation.
    Returns ``None``.
    """
    # 1. URL extraction: scheme followed by every non-whitespace character.
    print("=== URL提取 ===")
    text1 = "访问 https://www.example.com 和 http://sub.domain.org/path?query=1"
    url_pattern = r'https?://[^\s]+'
    urls = re.findall(url_pattern, text1)
    print(f"文本: {text1}")
    print(f"提取的URL: {urls}")

    # 2. Chinese extraction: runs of characters in U+4E00..U+9FFF.
    print("\n=== 中文提取 ===")
    text2 = "Hello 世界!这是中文文本。English words here."
    chinese_pattern = r'[\u4e00-\u9fff]+'
    chinese_words = re.findall(chinese_pattern, text2)
    print(f"文本: {text2}")
    print(f"中文词汇: {chinese_words}")

    # 3. Password strength check.
    print("\n=== 密码强度验证 ===")

    def check_password_strength(password):
        """Return True for 8+ chars containing lower, upper and a digit."""
        # Three lookaheads assert the required character kinds without
        # consuming input. fullmatch() makes the length rule apply to the
        # whole string (match() + '$' would tolerate a trailing newline).
        pattern = r'(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}'
        return bool(re.fullmatch(pattern, password))

    passwords = ['weak', 'Weak1', 'Strong123', 'VERYstrong123']
    for pwd in passwords:
        strength = "强" if check_password_strength(pwd) else "弱"
        print(f"密码 '{pwd}': {strength}")

    # 4. HTML tag stripping: remove every '<...>' span.
    print("\n=== HTML标签清理 ===")
    html_text = '<div class="test">Hello <b>World</b>!</div>'
    clean_text = re.sub(r'<[^>]+>', '', html_text)
    print(f"原始HTML: {html_text}")
    print(f"清理后: {clean_text}")

    # 5. ID-card number validation (format only, no checksum).
    print("\n=== 身份证号验证 ===")

    def validate_id_card(id_card):
        """Return True if *id_card* has the 18-character ID number layout."""
        # 6-digit region code, 8-digit birth date (century 18/19/20, month
        # 01-12, day 01-31), 3-digit sequence, then a digit or X/x.
        pattern = r'[1-9]\d{5}(18|19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dXx]'
        return bool(re.fullmatch(pattern, id_card))

    id_cards = ['11010119900307567X', '123456', '510101198501011234']
    for id_card in id_cards:
        valid = "有效" if validate_id_card(id_card) else "无效"
        print(f"身份证 {id_card}: {valid}")
def advanced_email_parser():
    """Parse sample e-mails into named parts using a ``re.VERBOSE`` pattern.

    Prints the local part, domain and top-level domain of each address that
    matches, and reports the others as invalid. Returns ``None``.
    """

    def parse_email(email):
        """Return a dict with 'local', 'domain' and 'tld', or None."""
        # With re.VERBOSE, whitespace and '#' comments inside the pattern
        # are ignored, so it can be laid out one component per line.
        pattern = r'''
            ^
            (?P<local>[a-zA-Z0-9._%+-]+) # 本地部分
            @
            (?P<domain>[a-zA-Z0-9.-]+) # 域名
            \.
            (?P<tld>[a-zA-Z]{2,}) # 顶级域名
            $
        '''
        match = re.match(pattern, email, re.VERBOSE)
        if match:
            return match.groupdict()
        return None

    emails = [
        'user.name+tag@sub.example.co.uk',
        'test@localhost',
        'invalid.email@com'
    ]
    print("=== 高级邮箱解析 ===")
    for email in emails:
        parts = parse_email(email)
        if parts:
            print(f"邮箱: {email}")
            print(f" 本地部分: {parts['local']}")
            print(f" 域名: {parts['domain']}")
            print(f" 顶级域名: {parts['tld']}")
        else:
            print(f"无效邮箱: {email}")
if __name__ == "__main__":
    # Run the demos only when executed as a script, separated by a ruler.
    practical_examples()
    print("\n" + "="*50 + "\n")
    advanced_email_parser()
总结
正则表达式核心要点:
- 基本语法:
  - `\d` - 数字,`\w` - 字母、数字和下划线,`\s` - 空白字符
  - `.` - 任意字符(默认不含换行符),`*` - 0或多个,`+` - 1或多个,`?` - 0或1个
  - `{n}` - n个,`{n,m}` - n到m个
- 字符集:
  - `[abc]` - 匹配a,b,c
  - `[a-z]` - 匹配小写字母
  - `[^0-9]` - 匹配非数字
- 分组和引用:
  - `()` - 捕获分组
  - `(?:)` - 非捕获分组
  - `(?P<name>)` - 命名分组
- 边界匹配:
  - `^` - 行首,`$` - 行尾
  - `\b` - 单词边界
- 贪婪与非贪婪:
  - 默认贪婪: `.*`
  - 非贪婪: `.*?`
- re模块方法:
  - `match()` - 从头匹配
  - `search()` - 搜索匹配
  - `findall()` - 查找所有
  - `sub()` - 替换
  - `split()` - 分割
- 性能优化:
  - 预编译常用模式
  - 使用合适的标志位
  - 避免过度复杂的正则表达式
正则表达式是处理文本的强大工具,熟练掌握可以大大提高开发效率。建议在实践中不断练习,逐步掌握各种复杂模式的使用。

