字符串是 Python 中最常用的数据类型,但 90% 的程序员只用了 20% 的字符串操作。本文会介绍 20 个高频、高效的字符串操作技巧,掌握它们,你的代码效率能提升 10 倍。
一、字符串基础查找和替换(5 个)
1. find() vs index() —— 查找子字符串的位置
这两个方法看起来功能相同,其实有关键区别。
text = "Python is awesome, Python is powerful"
# find():找到返回索引,找不到返回 -1(不报错)
pos1 = text.find("Python")
print(pos1) # 输出:0
pos2 = text.find("Java")
print(pos2) # 输出:-1(不存在,返回 -1)
# index():找到返回索引,找不到报错
pos3 = text.index("Python")
print(pos3) # 输出:0
pos4 = text.index("Java")
# 报错:ValueError: substring not found
# 找第二次出现的位置
second_pos = text.find("Python", 1) # 从位置 1 开始查找
print(second_pos) # 输出:19
关键区别:
- find():找不到返回 -1,不会抛出异常,比较安全
- index():找不到抛出 ValueError,需要异常处理
最佳实践:子串可能不存在时推荐用 find(),省去异常处理;但一定要检查返回值是否为 -1(-1 也是合法的索引,直接拿去切片不会报错,只会得到错误结果)。
2. replace() —— 替换子字符串
text = "hello world, hello python"
# 基础替换:替换所有匹配
result1 = text.replace("hello", "hi")
print(result1)
# 输出:hi world, hi python
# 替换指定次数:只替换前 n 个
result2 = text.replace("hello", "hi", 1) # 只替换第 1 个
print(result2)
# 输出:hi world, hello python
# 大小写敏感
text2 = "Hello world, hello python"
result3 = text2.replace("hello", "hi")
print(result3)
# 输出:Hello world, hi python(第一个 H 没替换)
性能陷阱:
# ❌ 不好的做法(低效)
text = "a" * 1000000 # 100 万个 'a'
for i in range(100):
text = text.replace("a", "b") # 每次都遍历整个字符串
# ✅ 好的做法(高效)
import string
text = "a" * 1000000
result = text.translate(str.maketrans("a", "b")) # 一次遍历搞定
3. count() —— 统计子字符串出现次数
text = "the quick brown fox jumps over the lazy dog"
# 基础计数
count1 = text.count("the")
print(count1) # 输出:2
# 指定范围计数(从索引 5 到 30)
count2 = text.count("the", 5, 30)
print(count2) # 输出:0(第二个 "the" 从索引 31 开始,不在 [5, 30) 范围内)
# 统计不同字符的频率
stats = {}
for char in text:
if char != ' ':
stats[char] = stats.get(char, 0) + 1
print(stats)
# 输出:{'t': 2, 'h': 2, 'e': 3, ...}
# 更高效的做法:用 Counter
from collections import Counter
char_count = Counter(text.replace(" ", ""))
print(char_count.most_common(3)) # 输出最常见的 3 个字符
生产环境用法:
# 统计关键词在日志中出现的次数
log_text = """
ERROR: Database connection failed
WARNING: Memory usage high
ERROR: Timeout error
INFO: Server restarted
ERROR: Authentication failed
"""
error_count = log_text.count("ERROR")
warning_count = log_text.count("WARNING")
print(f"错误数:{error_count},警告数:{warning_count}")
4. startswith() 和 endswith() —— 前缀和后缀判断
filename = "document.pdf"
url = "https://www.example.com"
# 检查后缀
if filename.endswith((".pdf", ".doc", ".docx")):
print("这是一个文档文件")
# 检查前缀
if url.startswith(("http://", "https://")):
print("这是一个网址")
# 实际应用:文件过滤
import os
def get_python_files(directory):
    """Return the names of entries in *directory* that end with ``.py``.

    Only the entry names are returned (not full paths), in the order
    produced by ``os.listdir``. Sub-directories are not traversed.
    """
    python_files = []
    for file in os.listdir(directory):
        if file.endswith('.py'):
            python_files.append(file)
    return python_files
# 更 Pythonic 的做法
def get_python_files_v2(directory):
    """Return the ``.py`` entry names in *directory* using a list comprehension."""
    return [f for f in os.listdir(directory) if f.endswith('.py')]
性能对比:
# ❌ 不好的做法
if filename.endswith('.pdf') or filename.endswith('.doc'):
pass
# ✅ 好的做法(传入元组,一次调用即可判断多个后缀,更简洁)
if filename.endswith(('.pdf', '.doc')):
pass
5. strip() / lstrip() / rstrip() —— 去除空白字符
text = " hello world \n"
# strip():去除两端的空白
result1 = text.strip()
print(f"'{result1}'") # 输出:'hello world'
# lstrip():只去除左端
result2 = text.lstrip()
print(f"'{result2}'") # 输出:'hello world \n'
# rstrip():只去除右端
result3 = text.rstrip()
print(f"'{result3}'") # 输出:' hello world'
# ⚠️ 关键陷阱:不是只去除一个空格!
text2 = "---hello---"
print(text2.strip("-")) # 输出:hello(所有连续的 - 都被去除了)
# 自定义要去除的字符
text3 = "xxxhelloyyy"
print(text3.strip("xy")) # 输出:hello
print(text3.strip("xyhel")) # 输出:o(只要包含的字符都被去除)
# 实战应用:清理 CSV 数据
csv_line = " 张三 , 25 , 北京 \n"
fields = [f.strip() for f in csv_line.split(',')]
print(fields)
# 输出:['张三', '25', '北京']
# 处理用户输入
user_input = input("请输入你的名字:").strip()
# 自动去除多余的空白,避免数据不一致
常见错误:
# ❌ 错误:strip 去除的是字符的集合,不是字符串本身
text = "hello"
print(text.strip("lo")) # 输出:he(不是 hello)
# ✅ 正确做法:如果要去除某个字符串前缀
if text.startswith("lo"):
text = text[2:]
二、高级分割和连接(3 个)
6. split() 和 rsplit() —— 分割字符串的艺术
# 基础分割
text = "apple,banana,cherry,date"
parts1 = text.split(",")
print(parts1)
# 输出:['apple', 'banana', 'cherry', 'date']
# 限制分割次数
parts2 = text.split(",", 2) # 只分割 2 次
print(parts2)
# 输出:['apple', 'banana', 'cherry,date']
# rsplit():从右边开始分割
parts3 = text.rsplit(",", 2) # 从右边分割 2 次
print(parts3)
# 输出:['apple,banana', 'cherry', 'date']
# 多个分隔符分割(用正则表达式)
import re
text2 = "apple, banana; cherry: date"
parts4 = re.split(r'[,;:]', text2)
print(parts4)
# 输出:['apple', ' banana', ' cherry', ' date']
# 实战应用1:解析 URL
url = "https://www.example.com/path/to/resource?key=value&foo=bar"
protocol, rest = url.split("://", 1)
domain, rest = rest.split("/", 1)
path, query = rest.split("?", 1)
print(f"协议:{protocol},域名:{domain},路径:{path},查询:{query}")
# 实战应用2:解析 CSV 行
csv_line = 'John,"Smith, Jr.",30,New York'
# 简单 split 会出错,需要用 csv 模块
import csv
reader = csv.reader([csv_line])
fields = next(reader)
print(fields)
# 输出:['John', 'Smith, Jr.', '30', 'New York']
性能对比:
# ❌ 低效:多次分割
text = "a:b:c:d:e"
parts = text.split(":")
result = parts[2] # 获取第 3 个元素
# ✅ 高效:只分割需要的部分
result = text.split(":", 3)[2]
7. join() —— 连接字符串
# 基础连接
words = ["hello", "world", "python"]
result1 = " ".join(words)
print(result1) # 输出:hello world python
# 连接数字(需要转换)
numbers = [1, 2, 3, 4, 5]
result2 = "-".join(str(n) for n in numbers)
print(result2) # 输出:1-2-3-4-5
# 实战应用1:生成 SQL IN 语句
ids = [1, 2, 3, 4, 5]
sql = f"SELECT * FROM users WHERE id IN ({','.join(map(str, ids))})"
print(sql)
# 实战应用2:生成 URL 路径
path_parts = ["api", "v1", "users", "123"]
path = "/" + "/".join(path_parts)
print(path) # 输出:/api/v1/users/123
# 实战应用3:生成 CSV 行
data = ["张三", 25, "北京", "zhangsan@example.com"]
csv_line = ",".join(map(str, data))
print(csv_line)
# ⚠️ 性能陷阱:不要用 + 连接多个字符串
# ❌ 不好的做法(每次都创建新字符串,O(n²) 复杂度)
result = ""
for word in words:
result = result + " " + word
# ✅ 好的做法(一次性连接,O(n) 复杂度)
result = " ".join(words)
大规模数据对比:
import time
# 生成 10000 个字符串
data = ["word"] * 10000
# 用 + 连接(耗时)
start = time.time()
result = ""
for word in data:
result += word + ","
time1 = time.time() - start
# 用 join(快速)
start = time.time()
result = ",".join(data)
time2 = time.time() - start
print(f"+ 方式:{time1:.4f}s,join 方式:{time2:.4f}s")
# 输出示例:+ 方式:0.1234s,join 方式:0.0012s(具体数值因环境而异;CPython 对 += 有原地优化,实际差距可能小得多,但 join 始终是更稳妥的选择)
8. partition() 和 rpartition() —— 三分法分割
# partition():在第一个分隔符处分割成三部分
text = "name=John;age=30;city=NYC"
head, sep, tail = text.partition(";")
print(f"前:{head},分隔符:{sep},后:{tail}")
# 输出:前:name=John,分隔符:;,后:age=30;city=NYC
# 实战应用:解析 key=value 格式
def parse_key_value(text):
    """Split ``key=value`` text at the first ``=``.

    Returns a ``(key, value)`` tuple with surrounding whitespace removed
    from both parts. When *text* contains no ``=``, the value is ``None``.
    """
    key, sep, value = text.partition("=")
    # Parenthesised so it is obvious the conditional applies to the value only.
    return key.strip(), (value.strip() if sep else None)
result = parse_key_value("timeout = 3000")
print(result) # 输出:('timeout', '3000')
# rpartition():从右边开始分割
head, sep, tail = text.rpartition(";")
print(f"前:{head},分隔符:{sep},后:{tail}")
# 输出:前:name=John;age=30,分隔符:;,后:city=NYC
# 实战应用:获取文件扩展名
def get_file_info(filename):
    """Split *filename* at its last dot into ``(name, extension)``.

    The extension is returned without the dot. A filename that contains
    no dot is returned unchanged as the name, with an empty extension.
    """
    name, sep, ext = filename.rpartition(".")
    if not sep:
        # rpartition puts the whole string in the last slot when the
        # separator is absent, so fall back to the original filename.
        return filename, ""
    return name, ext
print(get_file_info("document.pdf")) # 输出:('document', 'pdf')
print(get_file_info("archive.tar.gz")) # 输出:('archive.tar', 'gz')
三、格式化和转换(4 个)
9. format() 和 f-string —— 字符串格式化的演进
name = "张三"
age = 25
salary = 15000.5
# 方式1:% 格式化(已过时)
result1 = "姓名:%s,年龄:%d,工资:%.2f" % (name, age, salary)
# 方式2:format() 方法(兼容性好)
result2 = "姓名:{},年龄:{},工资:{:.2f}".format(name, age, salary)
# 方式3:f-string(Python 3.6+,推荐)
result3 = f"姓名:{name},年龄:{age},工资:{salary:.2f}"
print(result3)
# 输出:姓名:张三,年龄:25,工资:15000.50
# f-string 的强大功能:可以直接执行表达式
print(f"下年工资:{salary * 1.1:.2f}") # 输出:下年工资:16500.55
# 对齐和填充
numbers = [1, 12, 123, 1234]
for num in numbers:
print(f"数字:{num:>5}")
# 输出:
# 数字:    1
# 数字:   12
# 数字:  123
# 数字: 1234
# 进制转换
num = 255
print(f"十进制:{num},十六进制:{num:x},二进制:{num:b}")
# 输出:十进制:255,十六进制:ff,二进制:11111111
# 百分比格式
rate = 0.8567
print(f"完成度:{rate:.2%}") # 输出:完成度:85.67%
# 数字分隔符(Python 3.6+)
large_num = 1234567890
print(f"大数字:{large_num:,}") # 输出:大数字:1,234,567,890
性能对比:
import time
name = "Python"
age = 10
# 对比三种方法的性能
iterations = 1000000
# % 格式化
start = time.time()
for _ in range(iterations):
result = "%s is %d years old" % (name, age)
time1 = time.time() - start
# format() 方法
start = time.time()
for _ in range(iterations):
result = "{} is {} years old".format(name, age)
time2 = time.time() - start
# f-string
start = time.time()
for _ in range(iterations):
result = f"{name} is {age} years old"
time3 = time.time() - start
print(f"% 格式化:{time1:.3f}s")
print(f"format():{time2:.3f}s")
print(f"f-string:{time3:.3f}s")
# 输出示例:通常 f-string 最快,str.format() 最慢(具体数值因 Python 版本和环境而异)
10. upper() / lower() / title() / swapcase() —— 大小写转换
text = "Hello World Python"
# 全部大写
print(text.upper()) # 输出:HELLO WORLD PYTHON
# 全部小写
print(text.lower()) # 输出:hello world python
# 标题格式(首字母大写)
print(text.title()) # 输出:Hello World Python
# 交换大小写
print(text.swapcase()) # 输出:hELLO wORLD pYTHON
# capitalize():首字母大写,其他小写
print(text.capitalize()) # 输出:Hello world python
# 实战应用1:规范化用户输入
user_email = input("请输入邮箱:").strip().lower()
# 防止大小写差异导致的问题
# 实战应用2:生成 URL slug
def slugify(text):
    """Lower-case *text* and replace each space with a hyphen.

    Only the space character is rewritten; punctuation and other
    characters are passed through unchanged.
    """
    return text.lower().replace(" ", "-")
print(slugify("Hello World Python")) # 输出:hello-world-python
# 实战应用3:检查密码复杂度
def check_password_strength(password):
    """Return True when *password* meets the minimum complexity rules.

    The password must be at least 8 characters long and contain at least
    one upper-case letter, one lower-case letter and one digit.
    """
    has_upper = any(c.isupper() for c in password)
    has_lower = any(c.islower() for c in password)
    has_digit = any(c.isdigit() for c in password)
    return len(password) >= 8 and has_upper and has_lower and has_digit
print(check_password_strength("Secure123")) # 输出:True
11. isdigit() / isalpha() / isalnum() —— 字符检验
# 检查是否全是数字
print("12345".isdigit()) # 输出:True
print("123a5".isdigit()) # 输出:False
# 检查是否全是字母
print("hello".isalpha()) # 输出:True
print("hello123".isalpha()) # 输出:False
# 检查是否全是字母或数字
print("hello123".isalnum()) # 输出:True
print("hello-123".isalnum()) # 输出:False
# 检查是否全是空格
print(" ".isspace()) # 输出:True
# 检查是否是合法标识符(变量名)
print("var_name".isidentifier()) # 输出:True
print("123var".isidentifier()) # 输出:False
# 检查是否全是大写/小写
print("HELLO".isupper()) # 输出:True
print("hello".islower()) # 输出:True
# 实战应用1:验证用户输入
def validate_username(username):
    """Validate *username* and return ``(is_valid, message)``.

    Rules, checked in order:
    1. length between 3 and 20 characters inclusive;
    2. first character is a letter;
    3. every character is a letter, a digit or an underscore.
    """
    if len(username) < 3 or len(username) > 20:
        return False, "用户名长度 3-20 位"
    if not username[0].isalpha():
        return False, "用户名首字必须是字母"
    # Underscores are allowed, so drop them before the alphanumeric test.
    if not username.replace("_", "").isalnum():
        return False, "用户名只能包含字母、数字和下划线"
    return True, "用户名合法"
print(validate_username("user_123")) # 输出:(True, '用户名合法')
print(validate_username("123user")) # 输出:(False, '用户名首字必须是字母')
# 实战应用2:数据类型识别
def detect_type(value_str):
    """Classify *value_str* by the kind of characters it contains.

    Returns one of four labels: all digits, all letters, a mix of letters
    and digits, or anything else (including the empty string).
    """
    if value_str.isdigit():
        return "整数"
    elif value_str.isalpha():
        return "字符串"
    elif value_str.isalnum():
        return "混合类型"
    else:
        return "其他"
print(detect_type("123")) # 输出:整数
12. zfill() 和 center() —— 填充和居中
# zfill():用 0 填充左边
num_str = "123"
print(num_str.zfill(5)) # 输出:00123
# 实战应用1:生成订单号
def generate_order_id(order_num):
    """Return an order id: ``ORD`` followed by *order_num* zero-padded to 6 digits.

    Numbers longer than six digits are not truncated.
    """
    return f"ORD{order_num:0>6d}"
print(generate_order_id(123)) # 输出:ORD000123
# center():居中(填充两边)
text = "Python"
print(text.center(15)) # 输出:"     Python    "(左 5 个空格,右 4 个空格)
print(text.center(15, "*")) # 输出:"*****Python****"
# ljust() 和 rjust():左对齐和右对齐
print(text.ljust(15, "-")) # 输出:Python---------
print(text.rjust(15, "-")) # 输出:---------Python
# 实战应用2:打印表格
def print_table(rows):
    """Print each row on its own line with cells centred in 15 columns.

    Cells are joined with ``|``. Every cell must be a string, because
    ``str.center`` is called on it directly.
    """
    for row in rows:
        print("|".join(cell.center(15) for cell in row))
rows = [
["姓名", "年龄", "城市"],
["张三", "25", "北京"],
["李四", "30", "上海"],
]
print_table(rows)
四、正则表达式和高级操作(8 个)
13. 正则表达式基础 —— match() / search() / findall()
import re
# match():从开头匹配
text = "Python 3.9"
if re.match(r"Python", text):
print("匹配成功")
# search():在全文中查找
if re.search(r"\d+\.\d+", text):
print("找到版本号")
# findall():找出所有匹配
emails = "contact us at john@example.com or jane@test.org"
found = re.findall(r"\b[\w.-]+@[\w.-]+\.\w+\b", emails)
print(found)
# 输出:['john@example.com', 'jane@test.org']
# 提取有分组的内容
text = "Price: $99.99, Tax: $7.50"
matches = re.findall(r"\$(\d+\.\d+)", text)
print(matches)
# 输出:['99.99', '7.50']
# 实战应用1:电话号码提取
def extract_phone_numbers(text):
    """Find North-American style phone numbers in *text*.

    Returns a list of ``(area, prefix, line)`` string tuples, one per
    match, because the pattern has three capturing groups. An optional
    leading ``+1``/``1`` country code is matched but not captured.
    """
    pattern = r"\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b"
    return re.findall(pattern, text)
text = "Call me at 123-456-7890 or (098) 765 4321"
print(extract_phone_numbers(text))
# 实战应用2:URL 提取
def extract_urls(text):
    """Return every ``http://`` or ``https://`` URL found in *text*.

    The pattern contains only non-capturing groups, so ``re.findall``
    yields the full matched URL strings.
    """
    pattern = r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
    return re.findall(pattern, text)
text = "Visit https://www.example.com or http://test.org for more info"
print(extract_urls(text))
14. sub() 和 subn() —— 正则替换
import re
# sub():替换所有匹配
text = "The price is $99.99 and tax is $7.50"
result = re.sub(r"\$(\d+\.\d+)", r"¥\1*7", text)
print(result)
# 输出:The price is ¥99.99*7 and tax is ¥7.50*7
# subn():替换并返回替换次数
text = "apple, apple, apple"
result, count = re.subn(r"apple", "orange", text)
print(f"替换了 {count} 处")
print(result)
# 使用函数进行动态替换
def replace_func(match):
    """Replacement callback for ``re.sub``: raise the matched price by 10%.

    Expects group 1 of *match* to hold the numeric part of the price and
    returns it as ``$`` followed by the new amount with two decimals.
    """
    price = float(match.group(1))
    return f"${price * 1.1:.2f}"
text = "Item 1: $100, Item 2: $50"
result = re.sub(r"\$(\d+(?:\.\d+)?)", replace_func, text)
print(result)
# 输出:Item 1: $110.00, Item 2: $55.00
# 实战应用1:日期格式转换
def convert_date_format(text):
    """Rewrite every ``YYYY-MM-DD`` date in *text* as ``DD/MM/YYYY``."""
    pattern = r"(\d{4})-(\d{2})-(\d{2})"
    # Back-references reorder the captured year, month and day.
    return re.sub(pattern, r"\3/\2/\1", text)
print(convert_date_format("Today is 2024-01-15"))
# 输出:Today is 15/01/2024
# 实战应用2:HTML 标签移除
def remove_html_tags(text):
    """Strip ``<...>`` tags from *text* and return what is left.

    This is a plain regex substitution, not an HTML parser: entities are
    not decoded and a ``>`` inside an attribute value ends the tag early.
    """
    return re.sub(r"<[^>]+>", "", text)
html = "<p>Hello <b>World</b></p>"
print(remove_html_tags(html))
# 输出:Hello World
15. compile() —— 预编译正则表达式(性能优化)
import re
# ❌ 不好的做法(每次都编译)
def validate_email_slow(email):
    """Benchmark helper: match *email* 1000 times with an inline pattern.

    Returns True when *email* matches the pattern, otherwise False.
    The loop always runs to completion so that timing this function
    measures 1000 calls to ``re.match``. Note that ``re`` caches compiled
    patterns, so the per-call cost is a cache lookup rather than a full
    recompilation.
    """
    matched = False
    for _ in range(1000):
        matched = re.match(r"^[\w\.-]+@[\w\.-]+\.\w+$", email) is not None
    return matched
# ✅ 好的做法(只编译一次)
# Compiled once at import time and reused by validate_email_fast.
email_pattern = re.compile(r"^[\w\.-]+@[\w\.-]+\.\w+$")


def validate_email_fast(email):
    """Benchmark helper: match *email* 1000 times with a precompiled pattern.

    Returns True when *email* matches ``email_pattern``, otherwise False.
    The loop always runs to completion so that timing this function
    measures 1000 calls to the compiled pattern's ``match``.
    """
    matched = False
    for _ in range(1000):
        matched = email_pattern.match(email) is not None
    return matched
# 性能对比
import time
email = "user@example.com"
start = time.time()
validate_email_slow(email)
time1 = time.time() - start
start = time.time()
validate_email_fast(email)
time2 = time.time() - start
print(f"未预编译:{time1:.4f}s,预编译:{time2:.4f}s")
# 预编译通常会更快一些:re 模块内部会缓存已编译的模式,所以省下的主要是每次调用时查缓存的开销,具体倍数因环境而异
# 实战应用:创建验证器类
class Validator:
    """Input validators backed by regular expressions compiled once.

    The patterns are class attributes, so they are compiled when the
    class is created and shared by every call.
    """

    EMAIL_PATTERN = re.compile(r"^[\w\.-]+@[\w\.-]+\.\w+$")
    PHONE_PATTERN = re.compile(r"^\d{10,11}$")
    URL_PATTERN = re.compile(r"^https?://")

    @classmethod
    def is_valid_email(cls, email):
        """Return True when the whole of *email* matches ``EMAIL_PATTERN``."""
        # fullmatch rejects a trailing newline, which ``$`` alone would accept.
        return cls.EMAIL_PATTERN.fullmatch(email) is not None

    @classmethod
    def is_valid_phone(cls, phone):
        """Return True when *phone* is exactly 10 or 11 digits."""
        return cls.PHONE_PATTERN.fullmatch(phone) is not None

    @classmethod
    def is_valid_url(cls, url):
        """Return True when *url* starts with ``http://`` or ``https://``.

        Only the scheme prefix is checked; the rest of the URL is not validated.
        """
        return cls.URL_PATTERN.match(url) is not None
print(Validator.is_valid_email("user@example.com")) # True
print(Validator.is_valid_phone("13800138000")) # True
print(Validator.is_valid_url("https://example.com")) # True
16. translate() —— 高效的字符替换
# 创建转换表
translation_table = str.maketrans("aeiou", "12345")
text = "hello world"
result = text.translate(translation_table)
print(result)
# 输出:h2ll4 w4rld
# 删除指定字符
delete_table = str.maketrans("", "", "aeiou")
text = "hello world"
result = text.translate(delete_table)
print(result)
# 输出:hll wrld
# 实战应用1:移除标点符号
import string
text = "Hello, World! How are you?"
remove_punctuation = str.maketrans("", "", string.punctuation)
result = text.translate(remove_punctuation)
print(result)
# 输出:Hello World How are you
# 实战应用2:数字转中文
chinese_map = str.maketrans("0123456789", "零一二三四五六七八九")
text = "My phone is 13800138000"
result = text.translate(chinese_map)
print(result)
# 输出:My phone is 一三八零零一三八零零零
# 性能对比:translate vs replace
import time
text = "hello world" * 10000
iterations = 10000
# 方式1:用 replace
start = time.time()
for _ in range(iterations):
result = text.replace("o", "0").replace("e", "3")
time1 = time.time() - start
# 方式2:用 translate
trans_table = str.maketrans("oe", "03")
start = time.time()
for _ in range(iterations):
result = text.translate(trans_table)
time2 = time.time() - start
print(f"replace 方式:{time1:.4f}s,translate 方式:{time2:.4f}s")
# translate 通常快 3-5 倍
17. expandtabs() —— 制表符处理
# 将制表符转换为空格
text = "name\tage\tcity\nJohn\t25\tNYC"
print(text.expandtabs(15))
# 输出对齐的表格
# 实战应用:处理日志文件中的缩进
log_text = "Error:\t\tConnection failed\nWarning:\t\tMemory high"
formatted = log_text.expandtabs(20)
print(formatted)
# 获取制表符的位置
text = "Line1\tColumn1\nLine2\tColumn2"
print(text.expandtabs(10))
18. encode() 和 decode() —— 字符编码转换
# 编码:字符串 → 字节
text = "Hello 世界 🌍"
# 编码为 UTF-8
encoded_utf8 = text.encode("utf-8")
print(encoded_utf8)
# 输出:b'Hello \xe4\xb8\x96\xe7\x95\x8c \xf0\x9f\x8c\x8d'
# 编码为 GB2312(简体中文)
encoded_gb = text.encode("gb2312", errors="ignore")
print(encoded_gb)
# 解码:字节 → 字符串
decoded = encoded_utf8.decode("utf-8")
print(decoded)
# 输出:Hello 世界 🌍
# 处理编码错误
text = "测试"
try:
# 尝试用 ASCII 编码(会失败)
encoded = text.encode("ascii")
except UnicodeEncodeError as e:
print(f"编码错误:{e}")
# 使用错误处理策略
# 'strict':遇到无法编码的字符报错(默认)
# 'ignore':忽略无法编码的字符
# 'replace':用 ? 替代无法编码的字符
# 'xmlcharrefreplace':用 XML 字符引用替代
text = "Hello 世界"
print(text.encode("ascii", errors="ignore"))
# 输出:b'Hello '
print(text.encode("ascii", errors="replace"))
# 输出:b'Hello ??'(每个无法编码的字符各替换为一个 ?)
print(text.encode("ascii", errors="xmlcharrefreplace"))
# 输出:b'Hello &#19990;&#30028;'
# 实战应用1:处理文件编码问题
def safe_read_file(filepath):
    """Read a text file, trying UTF-8 first and then GBK.

    Returns the decoded file contents. Raises ``ValueError`` when the
    file cannot be decoded with either encoding. Errors unrelated to
    decoding (for example a missing file) propagate unchanged.
    """
    # ASCII is a subset of UTF-8 and GB2312 is a subset of GBK, so trying
    # them after these two could never succeed; they are omitted.
    for encoding in ("utf-8", "gbk"):
        try:
            with open(filepath, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raise ValueError("无法读取文件,编码未知")
# 实战应用2:处理网络数据
import json
json_str = '{"name":"张三","age":25}'
json_bytes = json_str.encode("utf-8")
decoded_str = json_bytes.decode("utf-8")
data = json.loads(decoded_str)
print(data)
19. ljust() / rjust() / center() 的高级用法
# 基础用法
text = "Python"
print(text.ljust(15, "-")) # 输出:Python---------
print(text.rjust(15, "-")) # 输出:---------Python
print(text.center(15, "-")) # 输出:-----Python----
# 实战应用1:创建进度条
def progress_bar(percent, width=20):
    """Return a text progress bar such as ``[█████░░░░░] 50%``.

    *percent* is the completion percentage and *width* the number of
    cells in the bar. The filled portion is clamped to the bar width, so
    values below 0 or above 100 never change the bar's length; the
    numeric label still shows *percent* as given.
    """
    filled = max(0, min(width, int(width * percent / 100)))
    bar = "█" * filled + "░" * (width - filled)
    return f"[{bar}] {percent}%"
for i in range(0, 101, 10):
print(progress_bar(i))
# 实战应用2:对齐输出(类似表格)
def print_aligned_table(data):
    """Print *data* (a list of rows) as left-aligned columns joined by `` | ``.

    Each column is padded to the length of its longest cell, measured
    with ``len`` on the cell's string form. The column count is taken
    from the first row. Nothing is printed for empty *data*.
    """
    if not data:
        return
    # Width of each column is the longest cell found in that column.
    max_widths = [max(len(str(row[i])) for row in data)
                  for i in range(len(data[0]))]
    for row in data:
        aligned_row = [str(cell).ljust(width)
                       for cell, width in zip(row, max_widths)]
        print(" | ".join(aligned_row))
data = [
["姓名", "年龄", "城市"],
["张三", "25", "北京"],
["李四的昵称", "30", "上海"],
]
print_aligned_table(data)
# 实战应用3:美化日志输出
def format_log_message(level, message):
    """Return ``[LEVEL]`` padded to 10 columns, a space, then *message*."""
    level_str = f"[{level}]".ljust(10)
    return f"{level_str} {message}"
print(format_log_message("INFO", "Server started"))
print(format_log_message("ERROR", "Connection failed"))
print(format_log_message("WARNING", "Memory usage high"))
20. casefold() —— 激进的大小写折叠
# casefold():比 lower() 更激进的小写转换
# 适用于国际字符和不同语言
text = "ß"# 德文字母
print(text.lower()) # 输出:ß(不变)
print(text.casefold()) # 输出:ss(转换为两个 s)
# 实战应用1:不区分大小写的字符串比较
def case_insensitive_compare(str1, str2):
    """Return True when the two strings are equal after ``str.casefold``.

    Case folding handles cases ``lower()`` misses, for example the German
    "ß" folds to "ss". Accents are not removed.
    """
    return str1.casefold() == str2.casefold()
print(case_insensitive_compare("Straße", "STRASSE")) # 输出:True
print(case_insensitive_compare("hello", "HELLO")) # 输出:True
# 实战应用2:搜索功能
def search_case_insensitive(text, query):
    """Return True when *query* occurs in *text*, ignoring case.

    Both strings are case-folded before the substring test. Accented
    characters are not reduced to their base letters.
    """
    return query.casefold() in text.casefold()
print(search_case_insensitive("Hello World", "hello")) # 输出:True
print(search_case_insensitive("Naïve", "naive")) # 输出:False(casefold 只折叠大小写,不会去掉变音符号,"ï" 与 "i" 不相等)
# 性能对比:casefold vs lower
import time
text = ("Hello World Python " * 1000).casefold()
query = "world"
iterations = 100000
# 使用 lower()
start = time.time()
for _ in range(iterations):
query.lower() in text
time1 = time.time() - start
# 使用 casefold()
start = time.time()
for _ in range(iterations):
query.casefold() in text
time2 = time.time() - start
print(f"lower():{time1:.4f}s,casefold():{time2:.4f}s")
五、综合实战:完整的数据处理流程
综合案例1:解析和验证用户数据
import re
from collections import defaultdict
def parse_and_validate_user_data(csv_data):
    """Parse comma-separated user records and validate each row.

    Expected input (first non-blank line is the header):

        name,email,phone,age
        张三,zhangsan@example.com,13800138000,25
        李四,lisi@test.org,15900139000,30

    Each data row is checked in order: field count, email format, phone
    (exactly 11 digits) and age (an integer from 18 to 100). The first
    failing check records one error for that row and skips it.

    Returns a dict with:
        valid_users: list of row dicts keyed by header, ``age`` as int
        errors: list of error messages, each prefixed with the line number
        summary: a one-line count of valid and failed rows

    Fields are split on plain commas, so quoted fields that contain
    commas are not supported.
    """
    # Compiled once rather than once per row.
    email_pattern = re.compile(r"^[\w\.-]+@[\w\.-]+\.\w+$")

    lines = csv_data.strip().split("\n")
    headers = [h.strip() for h in lines[0].split(",")]
    users = []
    errors = []
    # Line numbers are 1-based and the header is line 1, so data starts at 2.
    for i, line in enumerate(lines[1:], start=2):
        fields = [f.strip() for f in line.split(",")]
        if len(fields) != len(headers):
            errors.append(f"行 {i}:字段数不匹配")
            continue
        user = dict(zip(headers, fields))

        if not email_pattern.match(user["email"]):
            errors.append(f"行 {i}:邮箱格式错误 - {user['email']}")
            continue

        if not user["phone"].isdigit() or len(user["phone"]) != 11:
            errors.append(f"行 {i}:电话格式错误 - {user['phone']}")
            continue

        # Keep the try body to the one call that can raise.
        try:
            age = int(user["age"])
        except ValueError:
            errors.append(f"行 {i}:年龄应为数字 - {user['age']}")
            continue
        if not 18 <= age <= 100:
            errors.append(f"行 {i}:年龄应在 18-100 之间")
            continue

        user["age"] = age
        users.append(user)

    return {
        "valid_users": users,
        "errors": errors,
        "summary": f"成功:{len(users)} 条,失败:{len(errors)} 条"
    }
# 使用示例
csv_data = """
name,email,phone,age
张三,zhangsan@example.com,13800138000,25
李四,invalid-email,15900139000,30
王五,wangwu@test.org,159001390,35
赵六,zhaoliu@test.org,18600136000,120
"""
result = parse_and_validate_user_data(csv_data)
print(result["summary"])
for error in result["errors"]:
print(f" ❌ {error}")
for user in result["valid_users"]:
print(f" ✅ {user['name']} - {user['email']}")
综合案例2:日志分析和统计
import re
from collections import Counter
def analyze_log_file(log_text):
    """Parse log text and summarise it by level.

    Expected line format:

        [2024-01-15 10:30:45] INFO: Server started
        [2024-01-15 10:30:50] ERROR: Connection failed

    Lines that do not match this format are ignored.

    Returns a dict with:
        total_logs: number of parsed lines
        level_distribution: dict mapping level name to count
        errors: list of parsed entries whose level is ``ERROR``
        error_count: length of ``errors``
        error_types: Counter of error messages, keyed by the text before
            the first ``:`` in each message
    """
    log_pattern = re.compile(
        r"\[(?P<timestamp>.*?)\]\s+(?P<level>\w+):\s+(?P<message>.*)"
    )
    logs = []
    level_count = Counter()
    # splitlines() also handles CRLF input, so no "\r" leaks into messages.
    for line in log_text.strip().splitlines():
        match = log_pattern.match(line)
        if not match:
            continue
        log_entry = match.groupdict()
        logs.append(log_entry)
        level_count[log_entry["level"]] += 1

    errors = [log for log in logs if log["level"] == "ERROR"]

    return {
        "total_logs": len(logs),
        "level_distribution": dict(level_count),
        "errors": errors,
        "error_count": len(errors),
        "error_types": Counter(e["message"].split(":")[0] for e in errors)
    }
# 使用示例
log_text = """
[2024-01-15 10:30:45] INFO: Server started
[2024-01-15 10:30:50] ERROR: Connection failed
[2024-01-15 10:31:00] WARNING: Memory usage high
[2024-01-15 10:31:05] ERROR: Connection failed
[2024-01-15 10:31:10] INFO: Request processed
"""
result = analyze_log_file(log_text)
print(f"总日志数:{result['total_logs']}")
print(f"日志级别分布:{result['level_distribution']}")
print(f"错误数:{result['error_count']}")
print(f"错误类型:{result['error_types']}")
综合案例3:URL 解析和清理
import re
from urllib.parse import urlparse, parse_qs
def analyze_urls(url_list):
    """Validate, parse and summarise a list of URL strings.

    Each entry is stripped of surrounding whitespace and kept only if it
    matches the http/https URL pattern. Kept URLs are parsed with
    ``urllib.parse``; a leading ``www.`` is dropped from the host when
    counting domains.

    Returns a dict with:
        total_urls: number of valid URLs
        unique_domains: number of distinct domains
        top_domains: up to five ``(domain, count)`` pairs, most common first
        urls: list of dicts with ``url``, ``domain``, ``path`` and ``params``
    """
    # Imported locally so the function works even when the surrounding
    # module has not imported Counter.
    from collections import Counter

    url_pattern = re.compile(
        r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
    )
    valid_urls = []
    domains = Counter()
    for url in url_list:
        url = url.strip()
        if not url_pattern.match(url):
            continue

        parsed = urlparse(url)
        domain = parsed.netloc
        # Strip only a leading "www."; replace() would also remove the
        # text from the middle of a host name.
        if domain.startswith("www."):
            domain = domain[len("www."):]
        domains[domain] += 1

        params = parse_qs(parsed.query)
        valid_urls.append({
            "url": url,
            "domain": domain,
            "path": parsed.path,
            "params": params
        })

    return {
        "total_urls": len(valid_urls),
        "unique_domains": len(domains),
        "top_domains": domains.most_common(5),
        "urls": valid_urls
    }
# 使用示例
urls = [
"https://www.example.com/path?key=value",
"http://test.org/api/users?id=123&type=admin",
"invalid-url",
"https://github.com/repository"
]
result = analyze_urls(urls)
print(f"有效 URL:{result['total_urls']}")
print(f"独特域名:{result['unique_domains']}")
print(f"顶级域名:{result['top_domains']}")
六、性能优化总结
场景1:大规模字符串拼接
# ❌ 不好(时间复杂度 O(n²))
result = ""
for i in range(10000):
result += f"Item {i}, "
# ✅ 好(时间复杂度 O(n))
result = ", ".join(f"Item {i}" for i in range(10000))
# 性能提升:100 倍以上
场景2:多次替换操作
# ❌ 不好(每次都遍历字符串)
text = "a" * 1000000
for char in "abcdefg":
text = text.replace(char, "x")
# ✅ 好(使用 translate,一次遍历)
trans = str.maketrans("abcdefg", "xxxxxxx")
text = text.translate(trans)
# 性能提升:10 倍以上
场景3:频繁的正则匹配
# ❌ 不好(每次都编译)
import re
for email in emails:
if re.match(r"^[\w\.-]+@[\w\.-]+\.\w+$", email):
pass
# ✅ 好(预编译)
pattern = re.compile(r"^[\w\.-]+@[\w\.-]+\.\w+$")
for email in emails:
if pattern.match(email):
pass
# 性能提升:2-3 倍
七、20 个操作速查表
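| 序号 | 操作 | 用途 | 示例 | 结果 |
|---|---|---|---|---|
| 1 | find() / index() | 查找子字符串的位置 | `"abc".find("c")` | `2` |
| 2 | replace() | 替换子字符串 | `"aXa".replace("X", "-")` | `'a-a'` |
| 3 | count() | 统计子字符串出现次数 | `"banana".count("a")` | `3` |
| 4 | startswith() / endswith() | 前缀和后缀判断 | `"a.py".endswith(".py")` | `True` |
| 5 | strip() / lstrip() / rstrip() | 去除两端字符 | `" hi ".strip()` | `'hi'` |
| 6 | split() / rsplit() | 分割字符串 | `"a,b,c".split(",")` | `['a', 'b', 'c']` |
| 7 | join() | 连接字符串 | `"-".join(["a", "b"])` | `'a-b'` |
| 8 | partition() / rpartition() | 三分法分割 | `"k=v".partition("=")` | `('k', '=', 'v')` |
| 9 | format() / f-string | 字符串格式化 | `f"{3.14159:.2f}"` | `'3.14'` |
| 10 | upper() / lower() / title() / swapcase() | 大小写转换 | `"Hi".swapcase()` | `'hI'` |
| 11 | isdigit() / isalpha() / isalnum() | 字符检验 | `"123".isdigit()` | `True` |
| 12 | zfill() / center() | 填充和居中 | `"7".zfill(3)` | `'007'` |
| 13 | re.match() / re.search() / re.findall() | 正则查找 | `re.findall(r"\d+", "a1b22")` | `['1', '22']` |
| 14 | re.sub() / re.subn() | 正则替换 | `re.sub(r"\d", "#", "a1b2")` | `'a#b#'` |
| 15 | re.compile() | 预编译正则表达式 | `re.compile(r"\d+").search("a42").group()` | `'42'` |
| 16 | translate() | 按映射表批量替换字符 | `"abc".translate(str.maketrans("ab", "xy"))` | `'xyc'` |
| 17 | expandtabs() | 制表符转空格 | `"a\tb".expandtabs(4)` | `'a   b'` |
| 18 | encode() / decode() | 字符编码转换 | `"é".encode("utf-8")` | `b'\xc3\xa9'` |
| 19 | ljust() / rjust() / center() | 对齐和填充 | `"ab".rjust(4, "*")` | `'**ab'` |
| 20 | casefold() | 激进的大小写折叠 | `"ß".casefold()` | `'ss'` |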
八、最佳实践建议
✅ 做这些事
- 使用 f-string —— 最新、最快、最可读
- 用 join() 拼接 —— 不要在循环里用 + 连接大量字符串
- 预编译正则 —— 频繁匹配时把模式提前编译并复用
- 使用 strip() —— 清理用户输入数据
- 选择合适的检验方法 —— isdigit、isalpha 等
- 用 translate —— 需要同时替换多个单字符时一次遍历完成
- 编码统一 —— 优先使用 UTF-8
- 验证输入 —— 始终验证外部输入
总结
这 20 个字符串操作涵盖了 Python 中绝大多数的实际应用场景。关键是要理解:
- 基础操作(1-5):是所有字符串处理的基础
- 高效操作(6-7):join 和 split 是性能的关键
- 验证操作(11):确保数据质量
- 正则表达式(13-15):处理复杂匹配的利器
- 性能优化(translate、compile):处理大规模数据的必备

