text cleaning is still no fun

jannis.grundmann 2017-09-19 14:42:38 +02:00
parent 20d9eed5b3
commit a380b57bfc
3 changed files with 139 additions and 41 deletions


@@ -456,7 +456,7 @@ maybe split categories into subcategories
general:
fix utf encoding, split words at special characters
remove names
remove names, and addresses that follow greetings
remove emails, urls, numbers
maybe even everything that contains any of those (or contains a .toplvldomain or special characters, or anything that contains an @)
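A rough sketch of how such a filter could look, with an illustrative regex and hypothetical helper names (neither is the pattern used later in the script):

import re

# illustrative: words ending in a top-level domain, e.g. ".de" or ".co.uk"
regex_toplvl_end = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?$', re.IGNORECASE)

def looks_like_noise(word):
    # emails, urls, hostnames, ticket numbers: anything with an @, a TLD suffix or a digit
    return "@" in word or bool(regex_toplvl_end.search(word)) or any(ch.isdigit() for ch in word)

def drop_noise(tokens):
    return [w for w in tokens if not looks_like_noise(w)]

print(drop_noise(["uniaccount", "nr54065467", "tanja.saborowski@tu-dortmund.de"]))
# -> ['uniaccount']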


@@ -187,6 +187,13 @@ def remove_short_words():
def remove_first_names():
return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]
# if a word is not in the vocab, first check whether a sub-word of it is known; if so, check whether there is junk before or after it and split if necessary
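A hedged sketch of that idea; split_unknown_word and the vocab set are hypothetical, nothing like this exists in the script yet:

def split_unknown_word(word, vocab, min_len=4):
    # if the word itself is unknown, look for a known sub-word inside it
    # and strip whatever junk surrounds it; otherwise return the word unchanged
    if word in vocab:
        return word
    for start in range(len(word)):
        for end in range(len(word), start + min_len, -1):
            if word[start:end] in vocab:
                return word[start:end]
    return word

# split_unknown_word("xxdruckerxx", {"drucker"}) -> "drucker"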
############# strings
def stringcleaning(stringstream, funclist):
@@ -196,6 +203,10 @@ def stringcleaning(stringstream, funclist):
yield string
def seperate_words_on_regex(regex=regex_specialChars):
return lambda string: " ".join(re.compile(regex).split(string))
def remove_words_containing_topLVL():
return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ])
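For example, remove_words_containing_topLVL() drops whole words that look like hostnames or mail domains (relying on the regex_topLvl pattern defined elsewhere in the script); a quick check:

drop_tlds = remove_words_containing_topLVL()
print(drop_tlds("bitte den rechner m-sw1-vl4053.itmc.tu-dortmund.de neu starten"))
# -> "bitte den rechner neu starten"  (the hostname word is dropped, the rest is lowercased)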
@@ -212,10 +223,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)
def replaceHardS(replace_with="ss"):
def replaceSharpS(replace_with="ss"):
return lambda string: re.sub(r'[ß]',replace_with,string.lower())
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
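The nested re.sub chain in replaceRockDots() could equally be written with a single translation table; a sketch of an equivalent alternative, not the committed code:

UMLAUT_MAP = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})

def replaceRockDots_alt():
    # same effect: lowercase, then transliterate umlauts and ß
    return lambda string: string.lower().translate(UMLAUT_MAP)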
def fixUnicode():
return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
@@ -230,28 +246,36 @@ def lemmatizeWord(word,filepath=LEMMAS):
def lemmatize():
#todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
return lambda tok: lemmatizeWord(tok.lower_)
return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
def processTextstream(textstream, string_funclist, tok_funclist,parser=DE_PARSER):
def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER, single_doc_func=None):
"""
:param textstream: string-gen
:param funclist: [func]
:param parser: spacy-parser
:return: string-gen
"""
# first the string-level functions
pipe = parser.pipe(stringcleaning(textstream,string_funclist))
tokens=[]
for doc in pipe:
tokens = [tok for tok in doc]
tokens = processTokens(tokens,tok_funclist,parser)
# then the token-level ones
tokens = processTokens(tokens,tok_funclist)
if single_doc_func is not None:
yield single_doc_func(parser(" ".join([tok.lower_ for tok in tokens])))
else:
yield " ".join([tok.lower_ for tok in tokens])
def processTokens(tokens, funclist, parser):
def processTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
@@ -261,25 +285,33 @@ def processTokens(tokens, funclist, parser):
return tokens
string_comp=[
replaceHardS(),
fixUnicode(),
replaceRockDots(),
remove_words_containing_topLVL(),
replaceSpecialChars(),
seperate_words_on_regex()
]
tok_comp=[
removeENT(["PERSON"]),
#removeENT(["PERSON"]),
remove_words_containing_Numbers(),
#keepPOS(["NOUN"]),
removePOS(["PUNCT","SPACE","NUM"]),
removeWords(de_stop_words),
remove_long_words(),
remove_short_words(),
remove_first_names()
remove_first_names(),
#keepPOS(["NOUN"]),
]
single_doc_func = lemmatize()
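Put together, a minimal run of the pipeline over an in-memory list of strings might look like this; the sample texts are made up, everything else is defined above:

sample_texts = [
    "Sehr geehrte Damen und Herren, mein Drucker druckt nicht mehr.",
    "Bitte den Rechner m-sw1-vl4053.itmc.tu-dortmund.de neu starten.",
]

for cleaned in processTextstream(sample_texts,
                                 string_funclist=string_comp,
                                 tok_funclist=tok_comp,
                                 single_doc_func=single_doc_func):
    print(cleaned)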
"""
@@ -332,7 +364,7 @@ ticketcorpus = textacy.Corpus(DE_PARSER)
## add files to textacy-corpus,
printlog("add texts to textacy-corpus")
ticketcorpus.add_texts(
processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp,tok_funclist=tok_comp)
processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp, single_doc_func=single_doc_func)
)
for i in range(10):
@@ -362,33 +394,6 @@ for i in range(10):
words = [
"uniaccount",
"nr54065467",
"nr54065467",
"455a33c5,"
"tvt?=",
"tanja.saborowski@tu-dortmund.de",
"-",
"m-sw1-vl4053.itmc.tu-dortmund.de",
"------problem--------"
]
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
for w in words:
print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""

testra.py (new file, 93 lines)

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
import corenlp as corenlp
import os
import re
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser
filepath = "lemmatization-de.txt"
blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")),parser=PatternParser(pprint=True, lemmata=True))
print(blob.parse())
# first column into {lemma : id}, second column into {word : id}
"""http://www.lexiconista.com/datasets/lemmatization/"""
lemma2id = {}
word2id = {}
for id,line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):
lemma = line.split()[0].strip().lower()
if lemma not in lemma2id:
lemma2id[lemma] = id
word = line.split()[1].strip().lower()
word2id[word] = lemma2id[lemma]  # map the inflected form to the id of its lemma
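With the two tables filled, a word can be resolved to its lemma string by inverting lemma2id; a small lookup sketch (the inverted map and the example pair are assumptions, not part of the commit):

id2lemma = {v: k for k, v in lemma2id.items()}

def lemma_of(word):
    # fall back to the word itself if it is not listed in lemmatization-de.txt
    return id2lemma.get(word2id.get(word, -1), word)

# e.g. lemma_of("häuser") -> "haus", provided that pair occurs in the file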
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
def stringcleaning(stringstream, funclist):
for string in stringstream:
for f in funclist:
string = f(string)
yield string
def seperate_words_on_regex(regex=regex_specialChars):
return lambda string: " ".join(re.compile(regex).split(string))
words = [
"uniaccount",
"nr54065467",
"nr54065467",
"455a33c5,"
"tvt?=",
"tanja.saborowski@tu-dortmund.de",
"-",
"m-sw1-vl4053.itmc.tu-dortmund.de",
"------problem--------"
]
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
print(s.strip())
#print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""