-
Notifications
You must be signed in to change notification settings - Fork 0
/
nlp_preprocessing.py
58 lines (40 loc) · 1.29 KB
/
nlp_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import string
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
import spacy
# Load the German spaCy pipeline once at import time so that every call to
# lemmatizeSentence reuses the same model instead of reloading it.
# The 'de_core_news_md' model package must be installed for this to succeed.
nlp = spacy.load('de_core_news_md')
def nlp_preprocess_text(text: str, rmStopwords: bool = True, rmPunctation: bool = True,
                        lemmatizeText: bool = True) -> str:
    """
    Entry point that applies the enabled NLP preprocessing steps to a text.

    The steps always run in the same order: punctuation removal, stopword
    removal, lemmatization. A step is skipped when its flag is false.

    :param text: the raw text to preprocess
    :param rmStopwords: strip stopwords via removeStopwords
    :param rmPunctation: strip punctuation via removePunctation
    :param lemmatizeText: lemmatize the text via lemmatizeSentence
    :return: the text after all enabled steps have been applied
    """
    result = text
    if rmPunctation:
        result = removePunctation(result)
    if rmStopwords:
        result = removeStopwords(result)
    # Lemmatization is the final step, so its outcome can be returned directly.
    return lemmatizeSentence(result) if lemmatizeText else result
def removeStopwords(comment: str) -> str:
    """
    Remove German stopwords like 'aber', 'und', ... from a text.

    The text is split on whitespace and every word that appears in NLTK's
    German stopword list is dropped. Matching ignores case, so capitalised
    stopwords at the start of a sentence ('Aber', 'Und', ...) are removed too.
    The surviving words keep their original casing and are joined with
    single spaces.

    :param comment: the text to filter
    :return: the text without stopwords ("" when nothing is left)
    :raises LookupError: if the NLTK 'stopwords' corpus is not installed
    """
    stops = set(stopwords.words("german"))
    # The NLTK list is lower-case, so compare the lower-cased form of each
    # word; otherwise sentence-initial stopwords would slip through.
    return " ".join(word for word in comment.split() if word.lower() not in stops)
def removePunctation(text: str) -> str:
    """
    Remove ASCII punctuation such as '.' or '!' from a text.

    Every character contained in ``string.punctuation`` is deleted. Nothing is
    inserted in its place, so "a,b" becomes "ab". Characters outside that set
    (for example typographic quotes or dashes) are left untouched.

    :param text: the text to clean
    :return: the text without ASCII punctuation characters
    """
    # A table mapping each punctuation character to None lets str.translate
    # strip all of them in a single pass over the text.
    strip_table = str.maketrans("", "", string.punctuation)
    return text.translate(strip_table)
def tokenize(text: str, language: str = "german"):
    """
    Split a text into a list of word and punctuation tokens.

    Delegates to ``nltk.word_tokenize``. The language is passed explicitly
    because NLTK would otherwise fall back to its English sentence model,
    while this module works on German text.

    :param text: the text to tokenize
    :param language: name of the NLTK Punkt language model to use
    :return: list of token strings
    :raises LookupError: if the required NLTK tokenizer data is not installed
    """
    return nltk.word_tokenize(text, language=language)
def lemmatizeSentence(comment: str):
    """
    Replace every token of a text with its lemma,
    e.g. "Ich war mal größer" -> "Ich sein mal groß".

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    lemmas of the resulting tokens are joined with single spaces.
    """
    doc = nlp(comment)
    lemmas = (token.lemma_ for token in doc)
    return " ".join(lemmas)