Python 正则表达式

大兔子大兔子 提交于 2019-12-29 21:33:20

1.1 查找文本中的模式

import re

pattern = 'this'
text = 'Does this text match the pattern?'

# re.search() scans the text and returns a Match object for the first hit.
match = re.search(pattern, text)

# Start and end offsets of the matched substring inside the text.
s = match.start()
e = match.end()

# Show the pattern, the searched string, the span and the matched slice.
print(match.re.pattern, match.string, s, e, text[s:e])

 

1.2 编译表达式

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 import re
 4 
 5 regexes = [re.compile(p) for p in ['this','that']]
 6 text = 'Does this text match the pattern?'
 7 print("Text: %r\n",text)
 8 
 9 for regex in regexes:
10     print('Seeking "%s" ->' % regex.pattern)
11     if regex.search(text):
12         print('Match')
13     else:
14         print("No match!")

 

 1.3 多重匹配

import re

text = 'abbaaabbbbaaaaa'
pattern = 'ab'

# finditer() only ever yields Match objects, so no truthiness check is
# needed inside the loop.
for match in re.finditer(pattern, text):
    print('Found %s,start is %d,end is %d' %
          (text[match.start():match.end()], match.start(), match.end()))

 

1.4模式语法

  1 #!/usr/bin/env python
  2 # -*- coding:utf-8 -*-
  3 import re
  4 
  5 def find_patterns(text,patterns=[]):
  6     for pattern,desc in patterns:
  7         print("Pattern %r (%s)" % (pattern,desc))
  8         print("%r" % text)
  9         match_compile = re.compile(pattern)
 10         for match  in match_compile.finditer(text):
 11             print("%s%r" % ('*' * match.start(),text[match.start():match.end()]))
 12     return
 13 
 14 
 15 if __name__ == "__main__":
 16     # -----------------------重复-----------------------------
 17     # python正则表达式有5种重复的方式,如下:
 18     # 1.星号(*)表示前面的模式重复0次或无限次
 19     # 2.加与(+)表示前面的模式重复1次或无限次(至少一次)
 20     # 3.问号(?)表示前面的模式重复0次或1次
 21     # 4.大括号({m,n})表示前面的模式重复次数为:m<=x<=n
 22     # 5.大括号({m,})表示前面的模式至少重复m次
 23     find_patterns(
 24         'abbaabbba',
 25         [('ab*', 'a后面没有b或无限个b'),  # abb|a|abbb|a
 26          ('ab+', 'a后面至少有一个b'),  # abb|abbb
 27          ('ab?', 'a后面有0个或1个b'),  # ab|a|ab|a
 28          ('ab{3}', 'a后面有3个b'),  # abbb
 29          ('ab{2,3}', 'a后面b的个数为大于2个且小于3个')  # abb|abbb
 30          ]
 31     )
 32 
 33     # -----------------------关闭贪婪模式-----------------------------
 34     #正则表达式在匹配模式时采用的是贪婪算法,即尽可能多的匹配,这是很多
 35     #书及网上资料的解释,这种说法很片面。贪婪匹配正确的理解应该是这样的:
 36     #当尽可能多的匹配及尽可能少的匹配都匹配时,取可能多的匹配或者尽可能
 37     #多的匹配不匹配时取尽可能少的匹配;关闭贪婪模式时,当尽可能多的匹配
 38     #及尽可能少的匹配都匹配时取尽可能少的匹配,当尽可能少的匹配不匹配时
 39     #取尽可能多的匹配.通配符为*、+,?,但可以关闭这种算法,即心可能少的匹
 40     # 配,在后面加?即可,对应的能配符为*?、+?、??,{m,n}?
 41     find_patterns(
 42         'abbaabbba',
 43         [('ab*?', 'a后面没有b或无限个b'),  # a|a|a|a
 44          ('ab+?', 'a后面至少有一个b'),  # ab|ab
 45          ('ab??', 'a后面有0个或1个b'),  # a|a|a|a
 46          ('ab{3}?', 'a后面有3个b'),  # abbb
 47          ('ab{2,3}?', 'a后面b的个数为大于2个且小于3个')  # abb|abb
 48          ]
 49     )
 50     #从上面的例子中可以看到贪婪算法对ab{3}不起作用,这点值得注意
 51 
 52 
 53     # -----------------------字符集[]-----------------------------
 54     #1.匹配模式匹配里面任何一个字符即可,例如[ab]匹配a或b
 55     #2.很多特殊字符在字符集里将失去原来的意义,如+、.
 56     find_patterns(
 57         'abbaabbba',
 58         [('[ab]', '匹配a或b'),  # a|b|b|a|a|b|b|b|a
 59          ('a[ab]+', 'a后面匹配1个或多个a或b'),  # abbaabbba
 60          ('a[ab]+?', 'a后面匹配1个a或b')  # ab|aa
 61          ]
 62     )
 63 
 64     # -----------------------字符集区间-----------------------------
 65     #随着字符集变得越来越大,单个匹配会变得很枯燥,可以利用一种更为紧凑
 66     #的格式:区间
 67     find_patterns(
 68         'This is some text -- with punctuation.',
 69         [('[a-z]+','匹配一个或多个小写字母'), #his|is|some|text|with|punctuation
 70          ('[A-Z]+', '匹配一个或多个大写字母'),#T
 71          ('[a-zA-Z]+', '匹配一个或多个小写字母或大写字母'),#This|is|some|text|with|punctuation
 72          ('[A-Z][a-z]+', '一个大写字母后面匹配一个或多个小写字母')#This
 73         ]
 74     )
 75     # -----------------------元字符点号(.)-----------------------------
 76     #元字符点号(.)后面匹配单个字符,单行模式中不匹配换行符
 77     find_patterns(
 78         'abbabbbba',
 79         [('a.', '匹配a或b'),  # ab|aa
 80          ('b.', 'a后面匹配1个或多个a或b'),  # bb|bb|ba
 81          ('a.*b', 'a后面匹配1个a或b'),  # abbaabbb
 82          ('a.*?b','匹配ab或') # ab|aab,为什么最后一个匹配是aab,请参考"关闭贪婪模式"那段话
 83          ]
 84     )
 85 
 86     # -----------------------^-----------------------------
 87     #1.在字符集([])里^表示排除某些字符
 88     find_patterns(
 89         'This is some text -- with punctuation.',
 90         [('[^-. ]+','排除横杠,点号或空格')] #This|is|some|text|with|punctuation
 91     )
 92 
 93     # -----------------------转义码-----------------------------
 94     #1.\d:一个数字,同[0-9]
 95     #2.\D:非数字,同[^0-9]
 96     #3.\w:字母或数字,同[0-9a-zA-Z]
 97     #4.\W:非字母数字,同[^0-9a-zA-Z]
 98     #5.\s:空白字符,制表符、窗格、换行符
 99     #6.\S:非空白字符
100     find_patterns(
101         'A prime #1 example!',
102         [(r'\d+','匹配一个或多个数字'),#1
103          (r'\D+','匹配一个或多个除数字以外的多个字符'),#A prime #| example!
104          (r'\s+','匹配一个或多个空白字符'),#' '|' '|' '
105          (r'\S+','匹配一个或多个非空白字符'),#A|prime|#1|example!
106          (r'\w+','匹配一个或多个数字字符'),#A|prime|example
107          (r'\W+','匹配一个或多个非数字字符'),#' '| #|' '|!
108         ]
109     )
110 
111     # -----------------------匹配元字符-----------------------------
112     #如果要匹配正则表达式中的字符,则需要对搜索模式的字符进行转义
113     find_patterns(r'\d+ \D+ \s+',[(r'\\.\+','匹配元字符')])
114 
115     # -----------------------锚定符-----------------------------
116     #正则表达式除了匹配模式的内容外,还可以使用锚定符指定文本在模式
117     #中的相对位置
118     #1.^匹配字符串或行的开头
119     #2.$匹配字符串或行的末尾
120     #3.\A匹配字符串或行的开头
121     #4.\Z匹配字符串或行的末尾
122     #5.\b匹配一个单词的开头或末尾的空串
123     #6.\B不匹配一个单词的开头或末尾的空串
124     find_patterns(
125         'This is some text -- with punctuation.',
126         [(r'^\w+','匹配以字母数字开头的字符串或行'), #This
127          (r'\A\w+', '匹配以字母数字开头的字符串或行'),#This
128          (r'\w+\S*$', '匹配以字母数字及非空白字符结尾的字符串或行'),#punctuation.
129          (r'\w+\S*\Z', '匹配以字母数字及非空白字符结尾的字符串或行'),#punctuation.
130          (r'\w*t\w*','匹配包含字母t的单词'),#text|with|punctuation
131          (r'\bt\w+','匹配以t开头的单词'),#text
132          (r'\w+t\b','匹配以t结尾的单词'),#text
133          (r'\Bt\B','匹配字母t且字母t不在单词的开头或结尾') #t|t|t
134         ]
135     )

 

1.5 组解析匹配

 

  1 #!/usr/bin/env python
  2 # -*- coding:utf-8 -*-
  3 import re
  4 
  5 def find_patterns(text,patterns=[]):
  6     for pattern,desc in patterns:
  7         print("Pattern %r (%s)" % (pattern,desc))
  8         print("%r" % text)
  9         match_compile = re.compile(pattern)
 10         for match  in match_compile.finditer(text):
 11             print("%s%r" % ('*' * match.start(),text[match.start():match.end()]))
 12     return
 13 
 14 
 15 if __name__ == "__main__":
 16     # -----------------------分组()-----------------------------
 17     #为模式增加分组可以隔离匹配文本的各个部分,进一步扩展这些功能
 18     #来创建一个解析工具
 19     find_patterns(
 20         'abbaaabbbbaaaaa',
 21         [('a(ab)','a后面匹配ab'),#aab
 22          ('a(a*b*)','a后面匹配0-n个a或匹配0-n个b'),#abb|aaabbbb|aaaaa
 23          ('a(ab)*','a后面匹配0-n个ab'),#a|a|aab|a|a|a|a|a
 24          ('a(ab)+','a后面匹配1-n个ab')#aab
 25         ]
 26     )
 27 
 28     # -----------------------groups-----------------------------
 29     #为了访问一个模式中单个组所匹配的子串,可以使用Match对象的groups()方法
 30     #groups方法可以在match,search里使用,在finditer里不能使用
 31     text ='This is text -- with some text -- with punctuation.'
 32     print(text)
 33     patterns = [
 34         (r'^(\w+)','匹配以字母数字开头'),#This
 35         (r'(\w+)\S*$', '匹配以字母数字结尾或以非空白字符结尾'),#punctuation.
 36         (r'(\bt\w+)\W+(\w+)', '匹配以t开头的单词,以数字字母结尾及中间为非数字字母'),#text -- with
 37         (r'(\w+t)\b', '匹配t结尾的单词')#text
 38     ]
 39 
 40     for pattern,desc in patterns:
 41         regex = re.compile(pattern)
 42         match = regex.search(text)
 43         print('Pattern %r (%s)' % (pattern,desc))
 44         print('group is:',match.group(),'group(0) is:',match.group(0))
 45         print('groups is:',match.groups())
 46 
 47 
 48     # -----------------------name group-----------------------------
 49     #groups将分组存放到一个元组里,其实也可以将分组放到一个字典里
 50     text ='This is text -- with some text -- with punctuation.'
 51     print(text)
 52     patterns = [
 53         (r'^(?P<first_word>\w+)','匹配以字母数字开头'),#This
 54         (r'(?P<last_word>\w+)\S*$', '匹配以字母数字结尾或以非空白字符结尾'),#punctuation.
 55         (r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',  '匹配以t开头的单词,以数字字母结尾及中间为非数字字母'),#text -- with
 56         (r'(?P<ends_with_t>\w+t)\b', '匹配t结尾的单词')#text
 57     ]
 58 
 59     for pattern,desc in patterns:
 60         regex = re.compile(pattern)
 61         match = regex.search(text)
 62         print('Pattern %r (%s)' % (pattern,desc))
 63         print('group is:',match.group(),'group(0) is:',match.group(0),'match(1) is:',match.group(1))
 64         print('groups is:',match.groups())
 65         print('groupdict is:',match.groupdict())
 66 
 67 
 68     def find_patterns_dict(text, patterns=[]):
 69         for pattern, desc in patterns:
 70             print("Pattern %r (%s)" % (pattern, desc))
 71             print("%r" % text)
 72             match_compile = re.compile(pattern)
 73             for match in match_compile.finditer(text):
 74                 prefix = ' ' * match.start()
 75                 print("%s%r%s" % (prefix, text[match.start():match.end()],' ' * (len(text)-match.end())))
 76                 # print('This groups is:', match.groups()) if match.groups() else ''
 77                 if match.groups():
 78                     print('This groups is:',match.groups())
 79                 if match.groupdict():
 80                     print("%s%s" % (' ' * (len(text)-match.start()),match.groupdict()))
 81         return
 82 
 83     find_patterns_dict(
 84         'abbaabbba',
 85         [(r'a((a*)(b*))','1')] #abb('bb','','bb')|aabbb('abbb','a','bb')|a('','','')
 86     )
 87 
 88     # -----------------------管道符号|-----------------------------
 89     #a((a+)|(b+)),表示a后面只匹配由a或b一个字母构成的序列
 90     #a((a|b)+),表示a后面匹配可能包含a或b的序列,两者不同,请看下面的实例
 91     find_patterns_dict(
 92         'abbaabbba',
 93         [(r'a((a+)|(b+))','a后面只匹配a或b'),#abb('bb','','bb')|aa('a','a','')
 94          (r'a((a|b)+)', 'a后面可能匹配a或b等同于[ab]') #abbaabbba('bbaabbba','a')
 95          ]
 96     )
 97     # -----------------------非捕获组-----------------------------
 98     #将分组屏蔽,使其不在groups里
 99     find_patterns_dict(
100         'abbaabbba',
101         [(r'a((a+)|(b+))','a后面只匹配a或b'),  #abb('bb','','bb')|aa('a','a','')
102          (r'a((?:a+)|(?:b+))', 'a后面可能匹配a或b等同于[ab]'), #abb(bb,)|aa('a',)
103          (r'a(?:ab)+', 'a后面可能匹配a或b等同于[ab]') #aab
104          ]
105     )

 

1.6 搜索选项

 1 # ----------------------------不区分大小写-------------------------------
 2 import re
 3 text = 'This is some text -- with punctuatuion.'
 4 pattern = r'\bT\w+'
 5 with_case = re.compile(pattern)
 6 without_case = re.compile(pattern,re.IGNORECASE)
 7 
 8 print('Text:\n  %r' % text)
 9 print('Pattern:\n  %s'% pattern)
10 print('Case-sensitive:')
11 for match in with_case.findall(text):
12     print('  %r' % match)
13 print('Case-insensitive:')
14 for match in without_case.findall(text):
15     print('  %r' % match)
16 
# ---------------------------- Multi-line input ----------------------------
# With re.MULTILINE, ^ and $ also match at the start and end of every line
# instead of only at the start and end of the whole string.
text = 'This is some text -- with punctuatuion.\nA second line.'
pattern = r'(^\w+)|(\w+\S*$)'
single_line = re.compile(pattern)
multiline = re.compile(pattern, re.MULTILINE)

print('Text:\n  %r' % text)
print('Pattern:\n  %s' % pattern)
print('Single Line:')
# findall() returns one tuple per match because the pattern has two
# groups; wrap it in a 1-tuple so % formats it as a single value.
for match in single_line.findall(text):
    print('  %r' % (match,))
print('multiline:')
for match in multiline.findall(text):
    print('  %r' % (match,))
31 
# ---------------------------- DOTALL ----------------------------
# With re.DOTALL the dot also matches the newline character.
text = 'This is some text -- with punctuatuion.\nA second line.'
pattern = r'.+'
no_newlines = re.compile(pattern)
dotall = re.compile(pattern, re.DOTALL)

print('Text:\n  %r' % text)
print('Pattern:\n  %s' % pattern)
# Label each result with the regex that produced it, not with the labels
# of the MULTILINE example.
print('No newlines:')
for match in no_newlines.findall(text):
    print('  %r' % (match,))
print('Dotall:')
for match in dotall.findall(text):
    print('  %r' % (match,))

 1.7 匹配邮箱

  1 #!/usr/bin/env python
  2 # -*- coding:utf-8 -*-
  3 import re
  4 
  5 # ----------------------------匹配邮箱V1.0-------------------------------
  6 #这个正则表达式不完善,比如后缀为com,org,edu三者组合的也会匹配,这种匹配
  7 #不够严谨,因此最后加一个$可以修正这个bug
  8 address = re.compile('[\w\d.+-]+@([\w\d.]+\.)+(com|org|edu)$')
  9 
 10 candidates = [
 11     'first.last@example.com',
 12     'first.last+category@gmail.org',
 13     'valid-address@mail.example.edu',
 14     'not-valid@examle.foo'
 15 ]
 16 for candidate in candidates:
 17     match = address.search(candidate)
 18     print('%-30s   %s' % (candidate,'Matches' if match else 'No match'))
 19     if match:
 20         print(match.group(), match.groups())
 21 
 22 # ----------------------------匹配邮箱V2.0-------------------------------
 23 #使用re.VERBOSE
 24 address = re.compile(
 25     '''
 26    [\w\d.+-]+ #用户名
 27    @
 28    ([\w\d.]+\.)+ #域名
 29    (com|org|edu) #顶级域名
 30     ''',re.VERBOSE
 31 )
 32 
 33 candidates = [
 34     'first.last@example.com',
 35     'first.last+category@gmail.edu',
 36     'valid-address@mail.example.org',
 37     'not-valid@examle.foo'
 38 ]
 39 for candidate in candidates:
 40     match = address.search(candidate)
 41     print('%-30s   %s' % (candidate,'Matches' if match else 'No match'))
 42     if match:
 43         print(match.group(), match.groups())
 44 
 45 
 46 # ----------------------------匹配邮箱V3.0-------------------------------
 47 #
 48 address = re.compile(
 49 '''
 50 #匹配人名
 51    ((?P<name>
 52    ([\w.,]+\s+)*[\w.,]+)
 53    \s*
 54    <
 55    )?
 56 #匹配地址
 57   (?P<email>
 58   [\w\d.+-]+
 59   @
 60   ([\w\d.]+\.)+
 61   (com|org|edu)
 62   )
 63   >?
 64 ''',re.VERBOSE
 65 )
 66 #
 67 # address1 = re.compile('((?P<name>([\w.,]+\s+)*[\w.,]+)\s*<)?(?P<email>[\w\d.+-]+@([\w\d.]+\.)+(com|org|edu))>?$')
 68 #
 69 #
 70 candidates = [
 71     'first.last@example.com',
 72     'first.last+category@gmail.edu',
 73     'valid-address@mail.example.org',
 74     'not-valid@examle.foo',
 75     'First Last <first.last@example.com>',
 76     'No Brackets first.last@example.com',
 77     'First Last',
 78     'First Middle Last <frist.last@example.com>',
 79     'First M. Last <First.last@example.com>',
 80     '<first.last@example.com>'
 81 ]
 82 
 83 for candidate in candidates:
 84     print('candidate is:',candidate)
 85     match = address.search(candidate)
 86     if match:
 87         print('  Name:',match.groupdict()['name'])
 88         print('  Email:',match.groupdict()['email'])
 89         print('  The group is:',match.group())
 90         print('  The groups is:', match.groups())
 91 
 92     else:
 93         print('No match!')
 94 
 95 # ----------------------------模式中嵌套标志-------------------------------
 96 #若编译表达式里不能增加标志,则可以将标志嵌入到表达式字符串本身
 97 #python所以标志的缩写如下:
 98 #IGNORECASE->i
 99 #MULTILINE->m
100 #DOTALL->s
101 #UNICODE->u
102 #VERBOSE->x
103 import re
104 text = 'This is some text -- with punctuation.'
105 pattern = r'(?i)\b\T\w+'
106 regex = re.compile(pattern)
107 
108 print('The Text is:',text)
109 print('The Pattern is:',pattern)
110 print('The match is:',regex.findall(text))

 

1.8正则表达式的高级用法

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 import re
 4 address1 = re.compile(
 5 '''
 6 #name match
 7 ^(?P<name>([\w.]+\s+)*[\w.]+)?\s*
 8 
 9 (?(name)
10   (?P<last_name>(?=(<.*>$)))
11   |
12   (?=([^<].*[^>]$))
13 )
14 (?(last_name)<|\s*)
15 #email match
16 (?P<email>
17 [\w\d.+-]+
18 @
19 ([\w\d.]+\.)+
20 (com|org|edu)
21 )
22 (?(last_name)>|\s*)
23 $
24 ''',re.VERBOSE|re.IGNORECASE
25 )
26 
27 candidates = [
28     'First Last <first.last@example1.com>',
29     'first last@example.com',
30     'first.last first.last@example.com',
31     'Open Bracket <first.last@example.com',
32     'Close Bracket frist.last@example.com>'
33     ]
34 
35 for candidate in candidates:
36     print('candidate is:',candidate)
37     match = address.search(candidate)
38     if match:
39         print('  Name:',match.groupdict()['name'])
40         print('  Email:',match.groupdict()['email'])
41         print('  The group is:',match.group())
42         print('  The groups is:', match.groups())
43     else:
44         print('No match!')
45 
46 
47 
48 
49 if __name__ == "__main__":
50     pass

 

1.9修改字符串

 1 #version 0.1
import re

# Sample input: three paragraphs separated by runs of blank lines.
text = '''Paragraph one
on two lines.


Paragraph two.



Paragraph three.'''

# Each findall() hit is a (paragraph, separator) tuple: a lazy run of any
# characters (re.DOTALL lets "." cross newlines) ended either by two or
# more newlines or by the end of the text.
paragraphs = re.findall(r'(.+?)(\n{2,}|$)', text, flags=re.DOTALL)
for num, para in enumerate(paragraphs):
    print('%d %r' % (num, para))

 

 2 import re
 3 bold = re.compile(r'\*{2}(.*?)\*{2}')
 4 text = 'Make this **bold**.   This **too**.'
 5 print('Text:',text)
 6 print('Bold:',bold.sub(r'<b>\1</b>',text))
 7 
 8 #version 0.2
 9 import re
10 bold = re.compile(r'\*{2}(?P<name>.*?)\*{2}')
11 text = 'Make this **bold**.   This **too**.'
12 print('Text:',text)
13 print('Bold:',bold.sub(r'<b>\g<name></b>',text))
14 
15 #version 0.3
16 import re
17 bold = re.compile(r'\*{2}(.*?)\*{2}')
18 text = 'Make this **bold**.   This **too**.'
19 print('Text:',text)
20 print('Bold:',bold.sub(r'<b>\1</b>',text,count=2))

 

1.10 Split

 

import re

text = '''Paragraph one
on two lines.


Paragraph two.



Paragraph three.'''

# Without split: findall() needs a pattern that also describes the
# separator (or the end of the text) and yields (paragraph, separator)
# tuples.
for num, para in enumerate(re.findall(r'(.+?)(\n{2,}|$)', text, flags=re.DOTALL)):
    print(num, repr(para))

# With split: re.split() only needs the separator pattern and returns the
# paragraphs themselves.
for num, para in enumerate(re.split(r'\n{2,}', text)):
    print(num, repr(para))

 

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!