From a380b57bfc137ed84bfc48e034cf8e32384540cc Mon Sep 17 00:00:00 2001
From: "jannis.grundmann"
Date: Tue, 19 Sep 2017 14:42:38 +0200
Subject: [PATCH] text cleaning is still no fun

---
 test.py   |  2 +-
 testo.py  | 85 ++++++++++++++++++++++++++------------------
 testra.py | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+), 41 deletions(-)
 create mode 100644 testra.py

diff --git a/test.py b/test.py
index 6992c3c..8820f61 100644
--- a/test.py
+++ b/test.py
@@ -456,7 +456,7 @@ maybe split categories into subcategories
 
 general:
 fix utf encoding, split words on special characters
-remove names
+remove names, addresses after greetings
 remove emails, urls, numbers
 maybe even remove everything that contains any of that (or contains a .toplvldomain or special characters
 or anything that contains an @
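The notes in test.py above describe the intended cleaning strategy: fix the encoding, split words on special characters, and drop names, e-mail addresses, URLs and numbers, possibly discarding every token that merely contains such a pattern. The following is a minimal, self-contained sketch of that token filter; the regexes and the helper name drop_noise_tokens are illustrative assumptions, not code from this patch.

# Illustrative sketch of the filtering described in the test.py notes.
# The regexes and the helper name are assumptions, not repository code.
import re

EMAIL_RE  = re.compile(r"\S+@\S+")
TOPLVL_RE = re.compile(r"\.[a-z]{2,3}(\.[a-z]{2,3})?", re.IGNORECASE)
DIGIT_RE  = re.compile(r"\d")

def drop_noise_tokens(text):
    """Keep only tokens with no e-mail, no TLD-like suffix, no '@' and no digit."""
    kept = []
    for tok in text.split():
        if EMAIL_RE.search(tok) or TOPLVL_RE.search(tok) or "@" in tok or DIGIT_RE.search(tok):
            continue
        kept.append(tok.lower())
    return " ".join(kept)

print(drop_noise_tokens("Ticket nr54065467 von tanja.saborowski@tu-dortmund.de zum Drucker"))
# prints: ticket von zum drucker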
diff --git a/testo.py b/testo.py
index d6e9e9d..a88bb85 100644
--- a/testo.py
+++ b/testo.py
@@ -187,6 +187,13 @@ def remove_short_words():
 def remove_first_names():
     return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]
 
+
+
+# if a word is not in the vocabulary, first check whether a known subword is contained in it; if so, check whether there is junk before or after it and split the word if necessary
+
+
+
+
 ############# strings
 
 def stringcleaning(stringstream, funclist):
@@ -196,6 +203,10 @@ def stringcleaning(stringstream, funclist):
         yield string
 
 
+def seperate_words_on_regex(regex=regex_specialChars):
+    return lambda string: " ".join(re.compile(regex).split(string))
+
+
 def remove_words_containing_topLVL():
     return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ])
 
@@ -212,10 +223,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
     return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)
 
 
-def replaceHardS(replace_with="ss"):
+def replaceSharpS(replace_with="ss"):
     return lambda string: re.sub(r'[ß]',replace_with,string.lower())
 
 
+def replaceRockDots():
+    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
+
+
+
 def fixUnicode():
     return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
 
@@ -230,28 +246,36 @@ def lemmatizeWord(word,filepath=LEMMAS):
 
 def lemmatize():
     #todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
-    return lambda tok: lemmatizeWord(tok.lower_)
+    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
 
 
-def processTextstream(textstream, string_funclist, tok_funclist,parser=DE_PARSER):
+def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER, single_doc_func=None):
     """
     :param textstream: string-gen
    :param funclist: [func]
     :param parser: spacy-parser
     :return: string-gen
     """
+    # string-level functions first
     pipe = parser.pipe(stringcleaning(textstream,string_funclist))
-
+    tokens=[]
     for doc in pipe:
         tokens = [tok for tok in doc]
 
-        tokens = processTokens(tokens,tok_funclist,parser)
+        # then the token-level functions
+        tokens = processTokens(tokens,tok_funclist)
 
-        yield " ".join([tok.lower_ for tok in tokens])
+        if single_doc_func is not None:
+            yield single_doc_func(parser(" ".join([tok.lower_ for tok in tokens])))
+        else:
+            yield " ".join([tok.lower_ for tok in tokens])
 
 
 
-def processTokens(tokens, funclist, parser):
+
+
+
+def processTokens(tokens, funclist):
     # in:tokenlist, funclist
     # out: tokenlist
     for f in funclist:
@@ -261,25 +285,33 @@
     return tokens
 
 
 
+
+
 string_comp=[
-    replaceHardS(),
+    fixUnicode(),
+    replaceRockDots(),
     remove_words_containing_topLVL(),
-    replaceSpecialChars(),
+    seperate_words_on_regex()
 ]
 
 
 tok_comp=[
-    removeENT(["PERSON"]),
+    #removeENT(["PERSON"]),
     remove_words_containing_Numbers(),
 
-    #keepPOS(["NOUN"]),
     removePOS(["PUNCT","SPACE","NUM"]),
     removeWords(de_stop_words),
     remove_long_words(),
    remove_short_words(),
-    remove_first_names()
+    remove_first_names(),
+
+    #keepPOS(["NOUN"]),
+
 ]
 
+single_doc_func = lemmatize()
+
+
 
 """
@@ -332,7 +364,7 @@ ticketcorpus = textacy.Corpus(DE_PARSER)
 ## add files to textacy-corpus,
 printlog("add texts to textacy-corpus")
 ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp,tok_funclist=tok_comp)
+    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp, single_doc_func=single_doc_func)
 )
 
 for i in range(10):
@@ -362,33 +394,6 @@ for i in range(10):
 
 
 
-
-
-
-
-words = [
-    "uniaccount",
-    "nr54065467",
-    "nr54065467",
-    "455a33c5,"
-    "tvt?=",
-    "tanja.saborowski@tu-dortmund.de",
-    "-",
-    "m-sw1-vl4053.itmc.tu-dortmund.de",
-    "------problem--------"
-]
-
-topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
-specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
-
-for w in words:
-    print(stringcleaning(w,string_comp))
-    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
-    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
-    #result = specialFinder.sub(" ", w)
-    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
-
-    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
 """
 
 
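The reworked processTextstream above runs the string-level functions first, parses the cleaned strings, filters the resulting tokens, and optionally re-parses the joined text for a single_doc_func such as lemmatize(). Below is a minimal sketch of that two-stage composition in isolation; it substitutes a plain str.split() for the spaCy DE_PARSER, and the helper names and toy predicates are illustrative assumptions, not code taken from testo.py.

# Two-stage cleaning in isolation: string-level functions first, then
# token-level predicates. A plain str.split() stands in for the spaCy
# parser used in testo.py; everything here is an illustrative assumption.
import re

def replace_rock_dots(s):
    """Replace German umlauts and sharp s, mirroring the idea of replaceRockDots()."""
    for src, dst in (("ä", "ae"), ("ö", "oe"), ("ü", "ue"), ("ß", "ss")):
        s = s.lower().replace(src, dst)
    return s

def separate_on_special_chars(s):
    # same character class as the specialFinder regex in the patch
    return " ".join(re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', s))

string_funcs = [replace_rock_dots, separate_on_special_chars]
token_preds  = [lambda t: t,                                  # drop empty tokens
                lambda t: not any(c.isdigit() for c in t),    # drop tokens with digits
                lambda t: len(t) > 2]                         # drop very short tokens

def process(textstream, string_funcs, token_preds):
    for text in textstream:
        for f in string_funcs:            # string level
            text = f(text)
        tokens = text.split()             # stand-in for parser.pipe()
        for pred in token_preds:          # token level
            tokens = [t for t in tokens if pred(t)]
        yield " ".join(tokens)

print(list(process(["Grüße, mein Drucker (Raum 4053) druckt nicht!"],
                   string_funcs, token_preds)))
# prints: ['gruesse mein drucker raum druckt nicht']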
diff --git a/testra.py b/testra.py
new file mode 100644
index 0000000..2850c1f
--- /dev/null
+++ b/testra.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+import corenlp as corenlp
+import os
+import re
+import textacy
+import nltk
+from textblob_de import TextBlobDE
+from textblob_de import PatternParser
+
+filepath = "lemmatization-de.txt"
+
+
+
+
+blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")),parser=PatternParser(pprint=True, lemmata=True))
+
+print(blob.parse())
+
+
+
+
+
+# turn the first column into {lemma : id} and the second column into {word : id}
+
+
+
+"""http://www.lexiconista.com/datasets/lemmatization/"""
+
+lemma2id = {}
+word2id = {}
+
+for id,line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):
+
+    lemma = line.split()[0].strip().lower()
+    if lemma not in lemma2id:
+        lemma2id[lemma] = id
+
+    word = line.split()[1].strip().lower()
+
+    word2id[word] = lemma2id[lemma]
+
+
+
+
+
+
+
+
+
+
+"""
+regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]'
+
+
+def stringcleaning(stringstream, funclist):
+    for string in stringstream:
+        for f in funclist:
+            string = f(string)
+
+        yield string
+
+
+def seperate_words_on_regex(regex=regex_specialChars):
+    return lambda string: " ".join(re.compile(regex).split(string))
+
+
+words = [
+    "uniaccount",
+    "nr54065467",
+    "nr54065467",
+    "455a33c5,"
+    "tvt?=",
+    "tanja.saborowski@tu-dortmund.de",
+    "-",
+    "m-sw1-vl4053.itmc.tu-dortmund.de",
+    "------problem--------"
+]
+
+
+
+topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
+specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
+
+for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
+    print(s.strip())
+
+    #print(stringcleaning(w,string_comp))
+    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
+    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
+    #result = specialFinder.sub(" ", w)
+    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
+
+    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
+"""
\ No newline at end of file
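The loop in testra.py above builds lemma2id and word2id from the lexiconista lemmatization list, where each line pairs a lemma with an inflected form (as the loop reads it). The sketch below shows the same dictionary-based lookup on a few embedded sample lines, so it runs without lemmatization-de.txt; the sample data and helper names are assumptions for illustration, not the repository's API.

# Dictionary-based lemmatization in the spirit of lemma2id / word2id in
# testra.py. The sample lines stand in for lemmatization-de.txt
# (assumed format: "<lemma>\t<inflected form>" per line).
sample_lines = [
    "drucken\tdruckte",
    "drucken\tgedruckt",
    "drucker\tdrucker",
]

lemma2id = {}   # lemma -> numeric id
word2id  = {}   # inflected form -> id of its lemma

for idx, line in enumerate(sample_lines):
    lemma, word = (part.strip().lower() for part in line.split())
    if lemma not in lemma2id:
        lemma2id[lemma] = idx
    word2id[word] = lemma2id[lemma]

id2lemma = {v: k for k, v in lemma2id.items()}

def lemmatize_word(word):
    """Look the word up in the dictionary; fall back to the word itself."""
    return id2lemma.get(word2id.get(word.lower(), -1), word.lower())

print([lemmatize_word(w) for w in ["gedruckt", "Druckte", "ticket"]])
# prints: ['drucken', 'drucken', 'ticket']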