# regex_methods.py
# Worked examples of Python's `re` module: searching, substitution and utility functions,
# followed by two small end-to-end examples (e-mail validation and parsing a downloaded book).
import re
###################################################################################
################################ Searching functions ##############################
###################################################################################
# Patterns are raw strings (r'...') so that backslash sequences such as \d and \w
# reach the regex engine unchanged instead of being treated as (invalid) string escapes.

# re.search(<regex>, <string>, flags=0)
re.search(r'[a-z]+', '123FOO456', flags=re.IGNORECASE)  # first match anywhere -> 'FOO'
# re.match(<regex>, <string>, flags=0)
re.match(r'\d+', '123foobar')  # only string beginning -> matches '123'
re.match(r'\d+', 'foo123bar')  # digits are not at the beginning -> None
# re.fullmatch(<regex>, <string>, flags=0)
re.fullmatch(r'\d+', '123')  # only entire string -> matches '123'
re.fullmatch(r'\d+', 'foo123')  # 'foo' is not covered by the pattern -> None
# re.findall(<regex>, <string>, flags=0)
re.findall(r'\w+', '...foo,,,,bar:%$baz//|')  # all matches in list -> ['foo', 'bar', 'baz']
re.findall(r'#(\w+)#', '#foo#.#bar#.#baz#')  # with a group, only the group text is returned
re.findall(r'(#\w+#)', '#foo#.#bar#.#baz#')  # group spans the whole match -> '#foo#', ...
# re.finditer(<regex>, <string>, flags=0)
it = re.finditer(r'\w+', '...foo,,,,bar:%$baz//|')  # all matches in iterator
next(it)  # match object for 'foo'
next(it)  # match object for 'bar'
next(it)  # match object for 'baz'; the iterator is now exhausted
for i in re.finditer(r'\w+', '...foo,,,,bar:%$baz//|'):
    print(i)
###################################################################################
############################## Substitution functions #############################
###################################################################################
# re.sub(<regex>, <repl>, <string>, count=0, flags=0)
s = 'foo.123.bar.789.baz'
# Raw string keeps \d intact for the regex engine (no invalid-escape warning).
re.sub(r'\d+', '#', s)  # -> 'foo.#.bar.#.baz'
re.sub(r'[a-z]+', '(*)', s)  # -> '(*).123.(*).789.(*)'
def function10x(match_obj):
    """Replacement callback for ``re.sub``: scale numbers by 10, upper-case the rest.

    Args:
        match_obj: Match object handed over by ``re.sub`` for each match.

    Returns:
        The matched text multiplied by 10 (as a string) when it consists only of
        decimal digits, otherwise the matched text converted to upper case.
    """
    token = match_obj.group(0)  # the whole matched text
    # isdecimal() is used instead of isdigit(): isdigit() is also True for
    # characters such as superscript digits, which int() cannot parse and
    # which would therefore raise ValueError here.
    if token.isdecimal():
        return str(int(token) * 10)
    return token.upper()
# <repl> may be a function: it is called with each match object and returns the replacement text.
# Raw strings keep \w intact for the regex engine (no invalid-escape warning).
re.sub(r'\w+', function10x, 'foo.10.bar.20.baz.30')  # -> 'FOO.100.BAR.200.BAZ.300'
re.sub(r'\w+', 'xxx', 'foo.bar.baz.qux', count=2)  # apply times -> 'xxx.xxx.baz.qux'
# re.subn(<regex>, <repl>, <string>, count=0, flags=0)
re.subn(r'\w+', 'xxx', 'foo.bar.baz.qux')  # number of substitutions -> ('xxx.xxx.xxx.xxx', 4)
###################################################################################
################################# Utility functions ###############################
###################################################################################
# re.split(<regex>, <string>, maxsplit=0, flags=0)
re.split(r'\s*[,;/]\s*', 'foo,bar ; baz / qux')  # -> ['foo', 'bar', 'baz', 'qux']
string = 'foo,bar ; baz / qux'
# comma, semicolon, slash, spaces.
# The capturing group makes re.split() keep the delimiters in the result list;
# without it they are dropped, the fullmatch() test below is true for every item
# and the joined string loses its separators.
regex = r'(\s*[,;/]\s*)'
a = re.split(regex, string)  # -> ['foo', ',', 'bar', ' ; ', 'baz', ' / ', 'qux']
for i, s in enumerate(a):
    # True for the tokens but not for the delimiters
    if not re.fullmatch(regex, s):
        a[i] = f'<{s}>'
''.join(a)  # -> '<foo>,<bar> ; <baz> / <qux>'
# re.escape(<regex>)
# The expected value is a raw string: \^ \( \) \| are not valid string escapes.
re.escape('foo^bar(baz)|qux') == r'foo\^bar\(baz\)\|qux'  # -> True
# re.compile(<regex>, flags=0)
re_obj = re.compile(r'(\d+)')
re.search(re_obj, 'foo123bar')  # a compiled pattern can be passed to the module-level functions ...
re_obj.search('foo123bar')  # ... or used through its own methods; both find '123'
re_obj = re.compile(r'ba[rz]', flags=re.I)  # flags are fixed at compile time
r2 = re.search(re_obj, 'FOOBARBAZ')  # first match is 'BAR'
r3 = re_obj.search('FOOBARBAZ')  # same result through the pattern's method
# example
# match()  checks for a match only at the beginning of the string (by default)
# search() checks for a match anywhere in the string
# groups() returns all the groups matched
# Raw string so that \. reaches the regex engine as an escaped dot.
pattern = r"^[a-zA-Z0-9.-]+@([a-z]+\.[a-z]+)$"  # regular expression for common email validation
# NOTE(review): this literal used to be the placeholder "[email protected]", which contains
# no "@" and therefore could never match. A sample address is used so the example
# runs - replace it with the originally intended address if it is known.
search_string = "user@example.com"
match = re.match(pattern, search_string)  # match object, or None when the address does not fit
if match:
    domain = match.groups()[0]  # first (and only) group: the part after "@"
    print(domain)
# example text parsing
import requests
# URL of the plain-text book on gutenberg.org; passed to get_book() further down the script.
the_idiot_url = 'https://www.gutenberg.org/files/2638/2638-0.txt'
def get_book(url):
    """Download a Project Gutenberg text and return the slice between two markers.

    Args:
        url: Address of the plain-text book to download.

    Returns:
        The text that follows the "*** START OF ... PROJECT GUTENBERG EBOOK ... ***"
        header line and precedes the first occurrence of "Eydkuhnen".

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If one of the two markers is not present in the download.
    """
    # Send the HTTP request; the timeout keeps an unresponsive server from
    # blocking the script forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of searching an error page for markers.
    response.raise_for_status()
    raw = response.text

    # Discard the metadata from the beginning of the book. Both the "THIS" and
    # the "THE" wording of the header line are accepted.
    start_marker = re.search(
        r"\*\*\* START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK .* \*\*\*", raw
    )
    if start_marker is None:
        raise ValueError(f"start-of-book marker not found in text from {url}")

    # Discard everything from the end marker onwards.
    # NOTE(review): the slice stops at the FIRST occurrence of "Eydkuhnen" -
    # confirm that this word really marks the end of the wanted text.
    stop_marker = re.search("Eydkuhnen", raw)
    if stop_marker is None:
        raise ValueError(f"end marker 'Eydkuhnen' not found in text from {url}")

    # Keep the relevant text only.
    return raw[start_marker.end():stop_marker.start()]
def preprocess(sentence):
    """Normalise *sentence* for simple text analysis.

    Letters, digits, comma, dot and dash are kept; every run of any other
    characters is collapsed into a single space. The result is lower-cased.
    """
    disallowed_run = '[^A-Za-z0-9.,-]+'
    cleaned = re.sub(disallowed_run, ' ', sentence)
    return cleaned.lower()
# Download the book, normalise it and show the result.
book = get_book(the_idiot_url)
processed_book = preprocess(book)
print(processed_book)
# find number of 'the' - \b restricts the count to the whole word, so the
# letters inside e.g. 'there' or 'other' are not counted
len(re.findall(r'\bthe\b', processed_book))
# convert i to I (space i space) - lookarounds test the surrounding whitespace
# without consuming it, so back-to-back occurrences (' i i ') are all converted
processed_book = re.sub(r'(?<=\s)i(?=\s)', 'I', processed_book)
# words with --
re.findall(r'[a-zA-Z0-9]*--[a-zA-Z0-9]*', processed_book)