textcleaning is still no fun

jannis.grundmann 2017-09-19 14:42:38 +02:00
parent 20d9eed5b3
commit a380b57bfc
3 changed files with 139 additions and 41 deletions

View File

@@ -456,7 +456,7 @@ maybe split categories into subcategories
 general:
 fix utf encoding, split words at special characters
-remove names
+remove names, addresses after greetings
 remove emails, urls, numbers
 maybe even remove everything that contains any of those (or ends in a .toplvldomain, contains special characters, or anything that contains an @)
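A rough sketch of the word-level filtering these notes describe (hypothetical helper, not part of this commit; the top-level-domain regex is the same one used later in the script):

import re

regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

def drop_suspicious_words(text):
    # drop every word that contains an "@", a top-level domain, or a digit
    keep = []
    for w in text.split():
        if "@" in w:
            continue
        if re.search(regex_topLvl, w, re.IGNORECASE):
            continue
        if any(ch.isdigit() for ch in w):
            continue
        keep.append(w)
    return " ".join(keep)

print(drop_suspicious_words("bitte an tanja.saborowski@tu-dortmund.de unter nr54065467 melden"))
# -> "bitte an unter melden"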

View File

@@ -187,6 +187,13 @@ def remove_short_words():
 def remove_first_names():
     return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]

+# if a word is not in the vocab: first check whether a known subword is inside it;
+# if so, check whether junk comes before or after it and split the word if necessary

 ############# strings
 def stringcleaning(stringstream, funclist):
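A quick sketch of the splitting idea in the new comment above (hypothetical; assumes a vocab set of known words is available):

def split_on_known_subword(token, vocab):
    # if the token itself is known, keep it as is
    if token in vocab:
        return [token]
    # otherwise scan for a known subword (length >= 4) and drop the junk around it
    for start in range(len(token)):
        for end in range(len(token), start + 3, -1):
            if token[start:end] in vocab:
                return [token[start:end]]
    return [token]

vocab = {"problem", "drucker"}
print(split_on_known_subword("------problem--------", vocab))   # -> ['problem']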
@@ -196,6 +203,10 @@ def stringcleaning(stringstream, funclist):
         yield string

+def seperate_words_on_regex(regex=regex_specialChars):
+    return lambda string: " ".join(re.compile(regex).split(string))

 def remove_words_containing_topLVL():
     return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ])
@@ -212,10 +223,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
     return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)

-def replaceHardS(replace_with="ss"):
+def replaceSharpS(replace_with="ss"):
     return lambda string: re.sub(r'[ß]',replace_with,string.lower())

+def replaceRockDots():
+    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))

 def fixUnicode():
     return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
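Illustrative spot-check of the renamed and newly added helpers (not part of the commit):

print(replaceSharpS()("Straße"))             # -> "strasse"
print(replaceRockDots()("Grüße aus Köln"))   # -> "gruesse aus koeln"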
@@ -230,28 +246,36 @@ def lemmatizeWord(word,filepath=LEMMAS):
 def lemmatize():
     #todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
-    return lambda tok: lemmatizeWord(tok.lower_)
+    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])

-def processTextstream(textstream, string_funclist, tok_funclist,parser=DE_PARSER):
+def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER, single_doc_func=None):
     """
     :param textstream: string-gen
     :param funclist: [func]
     :param parser: spacy-parser
     :return: string-gen
     """
+    # first the string-level methods
     pipe = parser.pipe(stringcleaning(textstream,string_funclist))
+    tokens=[]
     for doc in pipe:
         tokens = [tok for tok in doc]
-        tokens = processTokens(tokens,tok_funclist,parser)
+        # then the token-level ones
+        tokens = processTokens(tokens,tok_funclist)
-        yield " ".join([tok.lower_ for tok in tokens])
+        if single_doc_func is not None:
+            yield single_doc_func(parser(" ".join([tok.lower_ for tok in tokens])))
+        else:
+            yield " ".join([tok.lower_ for tok in tokens])

-def processTokens(tokens, funclist, parser):
+def processTokens(tokens, funclist):
     # in:tokenlist, funclist
     # out: tokenlist
     for f in funclist:
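A minimal usage sketch of the reworked pipeline (illustrative only; DE_PARSER, string_comp, tok_comp and lemmatize() are the ones defined in this script):

texts = ["Grüße, der Drucker m-sw1-vl4053.itmc.tu-dortmund.de druckt nichts mehr."]

for cleaned in processTextstream(texts,
                                 string_funclist=string_comp,
                                 tok_funclist=tok_comp,
                                 single_doc_func=lemmatize()):
    print(cleaned)   # one cleaned (and lemmatized) string per input text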
@@ -261,25 +285,33 @@ def processTokens(tokens, funclist, parser):
     return tokens

 string_comp=[
-    replaceHardS(),
+    fixUnicode(),
+    replaceRockDots(),
     remove_words_containing_topLVL(),
-    replaceSpecialChars(),
+    seperate_words_on_regex()
 ]

 tok_comp=[
-    removeENT(["PERSON"]),
+    #removeENT(["PERSON"]),
     remove_words_containing_Numbers(),
-    #keepPOS(["NOUN"]),
     removePOS(["PUNCT","SPACE","NUM"]),
     removeWords(de_stop_words),
     remove_long_words(),
     remove_short_words(),
-    remove_first_names()
+    remove_first_names(),
+    #keepPOS(["NOUN"]),
 ]

+single_doc_func = lemmatize()

 """
@@ -332,7 +364,7 @@ ticketcorpus = textacy.Corpus(DE_PARSER)
 ## add files to textacy-corpus,
 printlog("add texts to textacy-corpus")
 ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp,tok_funclist=tok_comp)
+    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp, single_doc_func=single_doc_func)
 )

 for i in range(10):
@@ -362,33 +394,6 @@ for i in range(10):
-words = [
-    "uniaccount",
-    "nr54065467",
-    "nr54065467",
-    "455a33c5,"
-    "tvt?=",
-    "tanja.saborowski@tu-dortmund.de",
-    "-",
-    "m-sw1-vl4053.itmc.tu-dortmund.de",
-    "------problem--------"
-]
-
-topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
-specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
-
-for w in words:
-    print(stringcleaning(w,string_comp))
-    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
-    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
-    #result = specialFinder.sub(" ", w)
-    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
-    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))

 """

testra.py (new file, 93 additions)
View File

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
import corenlp as corenlp
import os
import re
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser
filepath = "lemmatization-de.txt"
blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")),parser=PatternParser(pprint=True, lemmata=True))
print(blob.parse())
# first column to {lemma : id}, second column to {word : id}
"""http://www.lexiconista.com/datasets/lemmatization/"""

lemma2id = {}
word2id = {}

for id,line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):

    lemma = line.split()[0].strip().lower()
    if lemma not in lemma2id:
        lemma2id[lemma] = id

    word = line.split()[1].strip().lower()
    word2id[word] = lemma2id[lemma]   # map the inflected form to the id of its lemma
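A possible lookup on top of the two dicts (hypothetical, not in the commit): resolve an inflected form to its lemma by going word -> shared id -> lemma.

id2lemma = {i: lemma for lemma, i in lemma2id.items()}

def lemma_of(word):
    word = word.strip().lower()
    # unknown words fall back to themselves
    return id2lemma.get(word2id.get(word, -1), word)

print(lemma_of("ging"))   # -> "gehen", assuming that pair is in lemmatization-de.txt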
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'

def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string

def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))

words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,"
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
    print(s.strip())

    #print(stringcleaning(w,string_comp))
    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    #result = specialFinder.sub(" ", w)
    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""