1.1 查找文本中的模式
import re

pattern = 'this'
text = 'Does this text match the pattern?'

# re.search() scans the text and returns the first Match (None if absent).
match = re.search(pattern, text)

# start()/end() delimit the slice of the input covered by the match.
s = match.start()
e = match.end()

print(match.re.pattern, match.string, s, e, text[s:e])
1.2 编译表达式
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re

# Compile each expression once so it can be reused for many searches.
regexes = [re.compile(p) for p in ['this', 'that']]
text = 'Does this text match the pattern?'

# Interpolate the text with %: print() does not apply %-formatting to
# extra positional arguments, it would just print "%r" literally.
print("Text: %r\n" % text)

for regex in regexes:
    print('Seeking "%s" ->' % regex.pattern)
    if regex.search(text):
        print('Match')
    else:
        print("No match!")
1.3 多重匹配
import re

text = 'abbaaabbbbaaaaa'
pattern = 'ab'

# finditer() yields only successful Match objects, so no truth test is
# needed inside the loop.
for match in re.finditer(pattern, text):
    start, end = match.span()
    print('Found %s,start is %d,end is %d' % (text[start:end], start, end))
1.4 模式语法
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re


def find_patterns(text, patterns=()):
    """Print every match of each pattern found in ``text``.

    Args:
        text: The string to search.
        patterns: Iterable of ``(pattern, description)`` pairs. Defaults to
            an empty tuple so no mutable default is shared between calls.

    For each pattern, the pattern and the text are printed, followed by one
    line per match. Each match is prefixed with one ``*`` per character that
    precedes it in ``text`` so its position is visible.
    """
    for pattern, desc in patterns:
        print("Pattern %r (%s)" % (pattern, desc))
        print("%r" % text)
        regex = re.compile(pattern)
        for match in regex.finditer(text):
            start, end = match.span()
            print("%s%r" % ('*' * start, text[start:end]))


if __name__ == "__main__":
    # ----------------------- Repetition -----------------------
    # There are five ways to express repetition:
    # 1. *      the preceding pattern repeats zero or more times
    # 2. +      the preceding pattern repeats one or more times
    # 3. ?      the preceding pattern repeats zero or one time
    # 4. {m,n}  the preceding pattern repeats x times, m <= x <= n
    # 5. {m,}   the preceding pattern repeats at least m times
    find_patterns(
        'abbaabbba',
        [('ab*', 'a后面没有b或无限个b'),        # abb|a|abbb|a
         ('ab+', 'a后面至少有一个b'),           # abb|abbb
         ('ab?', 'a后面有0个或1个b'),           # ab|a|ab|a
         ('ab{3}', 'a后面有3个b'),              # abbb
         ('ab{2,3}', 'a后面b的个数为2到3个'),   # abb|abbb
         ]
    )

    # ----------------------- Turning off greediness -----------------------
    # Quantifiers are greedy by default: they consume as much text as
    # possible while still letting the whole pattern match. Appending ?
    # (*?, +?, ??, {m,n}?) makes the quantifier consume as little as
    # possible while still letting the whole pattern match.
    find_patterns(
        'abbaabbba',
        [('ab*?', 'a后面没有b或无限个b'),        # a|a|a|a
         ('ab+?', 'a后面至少有一个b'),           # ab|ab
         ('ab??', 'a后面有0个或1个b'),           # a|a|a|a
         ('ab{3}?', 'a后面有3个b'),              # abbb
         ('ab{2,3}?', 'a后面b的个数为2到3个'),   # abb|abb
         ]
    )
    # Note that ab{3} is unaffected: an exact count leaves nothing to choose.

    # ----------------------- Character sets [] -----------------------
    # 1. A set matches any single character it lists, e.g. [ab] is a or b.
    # 2. Many special characters lose their meaning inside a set (+, .).
    find_patterns(
        'abbaabbba',
        [('[ab]', '匹配a或b'),                    # a|b|b|a|a|b|b|b|a
         ('a[ab]+', 'a后面匹配1个或多个a或b'),    # abbaabbba
         ('a[ab]+?', 'a后面匹配1个a或b'),         # ab|aa
         ]
    )

    # ----------------------- Character ranges -----------------------
    # Listing every character of a large set is tedious; a range is a
    # more compact way to write it.
    find_patterns(
        'This is some text -- with punctuation.',
        [('[a-z]+', '匹配一个或多个小写字母'),  # his|is|some|text|with|punctuation
         ('[A-Z]+', '匹配一个或多个大写字母'),  # T
         # This|is|some|text|with|punctuation
         ('[a-zA-Z]+', '匹配一个或多个小写字母或大写字母'),
         ('[A-Z][a-z]+', '一个大写字母后面匹配一个或多个小写字母'),  # This
         ]
    )

    # ----------------------- The dot metacharacter (.) -----------------------
    # A dot matches any single character except (by default) a newline.
    # The sample text is the same 'abbaabbba' used above; the expected
    # results in the comments below are for that text.
    find_patterns(
        'abbaabbba',
        [('a.', 'a后面匹配任意一个字符'),           # ab|aa
         ('b.', 'b后面匹配任意一个字符'),           # bb|bb|ba
         ('a.*b', 'a后面匹配任意字符并以b结尾'),    # abbaabbb
         # ab|aab: matching is leftmost-first, so the second match starts
         # at the first 'a' it meets and lazily extends to the next 'b'.
         ('a.*?b', 'a后面匹配任意字符并以b结尾(非贪婪)'),
         ]
    )

    # ----------------------- ^ inside a set -----------------------
    # Inside [] a leading ^ excludes the listed characters.
    find_patterns(
        'This is some text -- with punctuation.',
        # This|is|some|text|with|punctuation
        [('[^-. ]+', '排除横杠,点号或空格')]
    )

    # ----------------------- Escape codes -----------------------
    # 1. \d  a digit, like [0-9]
    # 2. \D  a non-digit, like [^0-9]
    # 3. \w  a word character, like [0-9a-zA-Z_]
    # 4. \W  a non-word character, like [^0-9a-zA-Z_]
    # 5. \s  whitespace (space, tab, newline, ...)
    # 6. \S  non-whitespace
    find_patterns(
        'A prime #1 example!',
        [(r'\d+', '匹配一个或多个数字'),                   # 1
         (r'\D+', '匹配一个或多个除数字以外的多个字符'),   # 'A prime #'|' example!'
         (r'\s+', '匹配一个或多个空白字符'),               # ' '|' '|' '
         (r'\S+', '匹配一个或多个非空白字符'),             # A|prime|#1|example!
         (r'\w+', '匹配一个或多个字母数字字符'),           # A|prime|1|example
         (r'\W+', '匹配一个或多个非字母数字字符'),         # ' '|' #'|' '|!
         ]
    )

    # ----------------------- Matching metacharacters -----------------------
    # To match a character that is special in regex syntax, escape it in
    # the search pattern.
    find_patterns(r'\d+ \D+ \s+', [(r'\\.\+', '匹配元字符')])

    # ----------------------- Anchors -----------------------
    # Besides matching content, a pattern can use anchors to constrain
    # where in the text the match may occur:
    # 1. ^   start of the string (or of each line with MULTILINE)
    # 2. $   end of the string (or of each line with MULTILINE)
    # 3. \A  start of the string only
    # 4. \Z  end of the string only
    # 5. \b  empty string at the start or end of a word
    # 6. \B  empty string that is NOT at the start or end of a word
    find_patterns(
        'This is some text -- with punctuation.',
        [(r'^\w+', '匹配以字母数字开头的字符串或行'),    # This
         (r'\A\w+', '匹配以字母数字开头的字符串或行'),   # This
         # punctuation.
         (r'\w+\S*$', '匹配以字母数字及非空白字符结尾的字符串或行'),
         # punctuation.
         (r'\w+\S*\Z', '匹配以字母数字及非空白字符结尾的字符串或行'),
         (r'\w*t\w*', '匹配包含字母t的单词'),            # text|with|punctuation
         (r'\bt\w+', '匹配以t开头的单词'),               # text
         (r'\w+t\b', '匹配以t结尾的单词'),               # text
         (r'\Bt\B', '匹配字母t且字母t不在单词的开头或结尾'),  # t|t|t
         ]
    )
1.5 组解析匹配
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re


def find_patterns(text, patterns=()):
    """Print every match of each pattern found in ``text``.

    Args:
        text: The string to search.
        patterns: Iterable of ``(pattern, description)`` pairs. Defaults to
            an empty tuple so no mutable default is shared between calls.

    Each match is printed prefixed with one ``*`` per character that
    precedes it in ``text``.
    """
    for pattern, desc in patterns:
        print("Pattern %r (%s)" % (pattern, desc))
        print("%r" % text)
        regex = re.compile(pattern)
        for match in regex.finditer(text):
            start, end = match.span()
            print("%s%r" % ('*' * start, text[start:end]))


def find_patterns_dict(text, patterns=()):
    """Like ``find_patterns`` but also print each match's groups.

    Args:
        text: The string to search.
        patterns: Iterable of ``(pattern, description)`` pairs.

    For every match the matched text is printed padded with spaces to its
    position in ``text``; then ``groups()`` is printed if the pattern has
    groups, and ``groupdict()`` if it has named groups.
    """
    for pattern, desc in patterns:
        print("Pattern %r (%s)" % (pattern, desc))
        print("%r" % text)
        regex = re.compile(pattern)
        for match in regex.finditer(text):
            start, end = match.span()
            print("%s%r%s" % (' ' * start, text[start:end],
                              ' ' * (len(text) - end)))
            if match.groups():
                print('This groups is:', match.groups())
            if match.groupdict():
                print("%s%s" % (' ' * (len(text) - start), match.groupdict()))


if __name__ == "__main__":
    # ----------------------- Grouping with () -----------------------
    # Adding groups to a pattern isolates parts of the matched text,
    # which is the basis for building a parser.
    find_patterns(
        'abbaaabbbbaaaaa',
        [('a(ab)', 'a后面匹配ab'),                          # aab
         ('a(a*b*)', 'a后面匹配0-n个a或匹配0-n个b'),        # abb|aaabbbb|aaaaa
         ('a(ab)*', 'a后面匹配0-n个ab'),                    # a|a|aab|a|a|a|a|a
         ('a(ab)+', 'a后面匹配1-n个ab'),                    # aab
         ]
    )

    # ----------------------- groups() -----------------------
    # Match.groups() returns the substrings matched by the individual
    # groups. It is a method of the Match object, so it works on the
    # result of match(), search() and on every Match yielded by finditer().
    text = 'This is text -- with some text -- with punctuation.'
    print(text)
    patterns = [
        (r'^(\w+)', '匹配以字母数字开头'),                       # This
        (r'(\w+)\S*$', '匹配以字母数字结尾或以非空白字符结尾'),   # punctuation.
        # text -- with
        (r'(\bt\w+)\W+(\w+)', '匹配以t开头的单词,以数字字母结尾及中间为非数字字母'),
        (r'(\w+t)\b', '匹配t结尾的单词'),                        # text
    ]

    for pattern, desc in patterns:
        regex = re.compile(pattern)
        match = regex.search(text)
        print('Pattern %r (%s)' % (pattern, desc))
        print('group is:', match.group(), 'group(0) is:', match.group(0))
        print('groups is:', match.groups())

    # ----------------------- Named groups -----------------------
    # groups() returns the groups as a tuple; with named groups they can
    # also be retrieved as a dictionary via groupdict().
    text = 'This is text -- with some text -- with punctuation.'
    print(text)
    patterns = [
        (r'^(?P<first_word>\w+)', '匹配以字母数字开头'),          # This
        # punctuation.
        (r'(?P<last_word>\w+)\S*$', '匹配以字母数字结尾或以非空白字符结尾'),
        # text -- with
        (r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
         '匹配以t开头的单词,以数字字母结尾及中间为非数字字母'),
        (r'(?P<ends_with_t>\w+t)\b', '匹配t结尾的单词'),          # text
    ]

    for pattern, desc in patterns:
        regex = re.compile(pattern)
        match = regex.search(text)
        print('Pattern %r (%s)' % (pattern, desc))
        print('group is:', match.group(), 'group(0) is:', match.group(0),
              'match(1) is:', match.group(1))
        print('groups is:', match.groups())
        print('groupdict is:', match.groupdict())

    find_patterns_dict(
        'abbaabbba',
        # abb('bb','','bb') | aabbb('abbb','a','bbb') | a('','','')
        [(r'a((a*)(b*))', '1')]
    )

    # ----------------------- Alternation with | -----------------------
    # a((a+)|(b+)) matches a followed by a run made of only a's or only b's.
    # a((a|b)+)    matches a followed by a run that may mix a's and b's.
    # A group that does not take part in the match is reported as None.
    find_patterns_dict(
        'abbaabbba',
        [(r'a((a+)|(b+))', 'a后面只匹配a或b'),  # abb('bb',None,'bb')|aa('a','a',None)
         # abbaabbba('bbaabbba','a')
         (r'a((a|b)+)', 'a后面可能匹配a或b等同于[ab]'),
         ]
    )

    # ----------------------- Non-capturing groups -----------------------
    # (?:...) groups without capturing, so the group is left out of groups().
    find_patterns_dict(
        'abbaabbba',
        [(r'a((a+)|(b+))', 'a后面只匹配a或b'),  # abb('bb',None,'bb')|aa('a','a',None)
         (r'a((?:a+)|(?:b+))', 'a后面只匹配a或b(非捕获组)'),  # abb('bb',)|aa('a',)
         (r'a(?:ab)+', 'a后面匹配1-n个ab(非捕获组)'),         # aab
         ]
    )
1.6 搜索选项
# ---------------------------- Case-insensitive matching ----------------------------
import re

text = 'This is some text -- with punctuatuion.'
pattern = r'\bT\w+'
with_case = re.compile(pattern)
without_case = re.compile(pattern, re.IGNORECASE)

print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)
print('Case-sensitive:')
for match in with_case.findall(text):
    print(' %r' % match)
print('Case-insensitive:')
for match in without_case.findall(text):
    print(' %r' % match)

# ---------------------------- Multi-line input ----------------------------
# With MULTILINE, ^ and $ also match at the start and end of every line.
text = 'This is some text -- with punctuatuion.\nA second line.'
pattern = r'(^\w+)|(\w+\S*$)'
single_line = re.compile(pattern)
multiline = re.compile(pattern, re.MULTILINE)

print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)
print('Single Line:')
for match in single_line.findall(text):
    print(' %r' % (match,))
print('multiline:')
for match in multiline.findall(text):
    print(' %r' % (match,))

# ---------------------------- DOTALL ----------------------------
# With DOTALL the dot also matches newline characters.
text = 'This is some text -- with punctuatuion.\nA second line.'
pattern = r'.+'
no_newlines = re.compile(pattern)
dotall = re.compile(pattern, re.DOTALL)

print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)
# Label each result after the regex that produced it (the labels used to
# be copied from the MULTILINE section above).
print('No newlines:')
for match in no_newlines.findall(text):
    print(' %r' % (match,))
print('Dotall:')
for match in dotall.findall(text):
    print(' %r' % (match,))
1.7 匹配邮箱
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re

# ---------------------------- Email matching V1.0 ----------------------------
# Without the trailing $ the pattern would also accept an address whose
# suffix merely begins with com/org/edu; anchoring the end fixes that.
# Raw strings keep the regex escapes (\w, \d, \.) away from Python's own
# string-escape processing.
address = re.compile(r'[\w\d.+-]+@([\w\d.]+\.)+(com|org|edu)$')

candidates = [
    'first.last@example.com',
    'first.last+category@gmail.org',
    'valid-address@mail.example.edu',
    'not-valid@examle.foo',
]
for candidate in candidates:
    match = address.search(candidate)
    print('%-30s %s' % (candidate, 'Matches' if match else 'No match'))
    if match:
        print(match.group(), match.groups())

# ---------------------------- Email matching V2.0 ----------------------------
# Same expression written with re.VERBOSE so it can be laid out and
# commented inside the pattern itself.
address = re.compile(
    r'''
    [\w\d.+-]+       #用户名
    @
    ([\w\d.]+\.)+    #域名
    (com|org|edu)    #顶级域名
    ''', re.VERBOSE
)

candidates = [
    'first.last@example.com',
    'first.last+category@gmail.edu',
    'valid-address@mail.example.org',
    'not-valid@examle.foo',
]
for candidate in candidates:
    match = address.search(candidate)
    print('%-30s %s' % (candidate, 'Matches' if match else 'No match'))
    if match:
        print(match.group(), match.groups())


# ---------------------------- Email matching V3.0 ----------------------------
# Adds an optional display name in front of an address wrapped in <...>.
address = re.compile(
    r'''
    #匹配人名
    ((?P<name>
        ([\w.,]+\s+)*[\w.,]+)
        \s*
        <
    )?
    #匹配地址
    (?P<email>
        [\w\d.+-]+
        @
        ([\w\d.]+\.)+
        (com|org|edu)
    )
    >?
    ''', re.VERBOSE
)
# One-line equivalent, anchored at the end, kept for reference:
# address1 = re.compile(r'((?P<name>([\w.,]+\s+)*[\w.,]+)\s*<)?(?P<email>[\w\d.+-]+@([\w\d.]+\.)+(com|org|edu))>?$')

candidates = [
    'first.last@example.com',
    'first.last+category@gmail.edu',
    'valid-address@mail.example.org',
    'not-valid@examle.foo',
    'First Last <first.last@example.com>',
    'No Brackets first.last@example.com',
    'First Last',
    'First Middle Last <frist.last@example.com>',
    'First M. Last <First.last@example.com>',
    '<first.last@example.com>',
]

for candidate in candidates:
    print('candidate is:', candidate)
    match = address.search(candidate)
    if match:
        print(' Name:', match.groupdict()['name'])
        print(' Email:', match.groupdict()['email'])
        print(' The group is:', match.group())
        print(' The groups is:', match.groups())
    else:
        print('No match!')

# ---------------------------- Flags embedded in the pattern ----------------------------
# When flags cannot be passed to compile(), they can be embedded in the
# expression string itself. The flag abbreviations are:
# IGNORECASE -> i
# MULTILINE  -> m
# DOTALL     -> s
# UNICODE    -> u
# VERBOSE    -> x
text = 'This is some text -- with punctuation.'
# "\T" is not a valid regex escape and makes re.compile() raise
# "bad escape \T"; the literal letter T is what is meant here.
pattern = r'(?i)\bT\w+'
regex = re.compile(pattern)

print('The Text is:', text)
print('The Pattern is:', pattern)
print('The match is:', regex.findall(text))
1.8 正则表达式的高级用法
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re

# The pattern uses conditional groups, (?(group)yes|no):
# - if a display name was matched, the rest must be wrapped in <...>
#   (checked by a look-ahead, recorded in the empty group "last_name");
# - otherwise the rest must neither start with < nor end with >.
# The brackets themselves are then consumed only when "last_name" matched.
address1 = re.compile(
    r'''
    #name match
    ^(?P<name>([\w.]+\s+)*[\w.]+)?\s*

    (?(name)
        (?P<last_name>(?=(<.*>$)))
        |
        (?=([^<].*[^>]$))
    )
    (?(last_name)<|\s*)
    #email match
    (?P<email>
        [\w\d.+-]+
        @
        ([\w\d.]+\.)+
        (com|org|edu)
    )
    (?(last_name)>|\s*)
    $
    ''', re.VERBOSE | re.IGNORECASE
)

candidates = [
    'First Last <first.last@example1.com>',
    'first last@example.com',
    'first.last first.last@example.com',
    'Open Bracket <first.last@example.com',
    'Close Bracket frist.last@example.com>',
]

for candidate in candidates:
    print('candidate is:', candidate)
    # Search with the expression compiled above; the name "address" is
    # never defined in this script.
    match = address1.search(candidate)
    if match:
        print(' Name:', match.groupdict()['name'])
        print(' Email:', match.groupdict()['email'])
        print(' The group is:', match.group())
        print(' The groups is:', match.groups())
    else:
        print('No match!')
1.9 修改字符串
import re

# Paragraph extraction with findall(): each result is a
# (paragraph, separator) tuple. Paragraphs are separated by two or more
# newlines, so the sample text must contain blank lines.
text = '''Paragraph one
on two lines.

Paragraph two.


Paragraph three.'''

for num, para in enumerate(re.findall(r'(.+?)(\n{2,}|$)', text, flags=re.DOTALL)):
    print(num, repr(para))

# version 0.1: refer to the captured group by number (\1)
bold = re.compile(r'\*{2}(.*?)\*{2}')
text = 'Make this **bold**. This **too**.'
print('Text:', text)
print('Bold:', bold.sub(r'<b>\1</b>', text))

# version 0.2: refer to the captured group by name (\g<name>)
bold = re.compile(r'\*{2}(?P<name>.*?)\*{2}')
text = 'Make this **bold**. This **too**.'
print('Text:', text)
print('Bold:', bold.sub(r'<b>\g<name></b>', text))

# version 0.3: limit the number of substitutions with count
bold = re.compile(r'\*{2}(.*?)\*{2}')
text = 'Make this **bold**. This **too**.'
print('Text:', text)
print('Bold:', bold.sub(r'<b>\1</b>', text, count=2))
1.10 Split
import re

# Paragraphs are separated by two or more newlines, so the sample text
# must contain blank lines for the separator pattern to match.
text = '''Paragraph one
on two lines.

Paragraph two.


Paragraph three.'''

# findall() works but needs a group for the separator and returns
# (paragraph, separator) tuples.
for num, para in enumerate(re.findall(r'(.+?)(\n{2,}|$)', text, flags=re.DOTALL)):
    print(num, repr(para))

# re.split() expresses the same thing directly: the pattern describes
# only the separator and the result is the list of paragraphs.
for num, para in enumerate(re.split(r'\n{2,}', text)):
    print(num, repr(para))
来源:https://www.cnblogs.com/chencsj/p/7967606.html