# -*- coding: utf-8 -*- from datetime import datetime import csv import sys from miscellaneous import * from datetime import datetime import time import textacy from scipy import * import os csv.field_size_limit(sys.maxsize) FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" # load config config_ini = FILEPATH + "config.ini" config = ConfigParser.ConfigParser() with open(config_ini) as f: config.read_file(f) global REGEX_SPECIALCHAR global REGEX_TOPLVL global THESAURUS global WORDS global LEMMAS global NOUNS global VORNAMEN global DE_STOP_WORDS global EN_STOP_WORDS REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' #+r',.' REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' THESAURUS = {} WORDS= {} LEMMAS= {} NOUNS= {} VORNAMEN= {} DE_STOP_WORDS= {} EN_STOP_WORDS= {} ############# filter tokens def keepPOS(pos_list): return lambda tok: tok.pos_ in pos_list def keepNouns(noun_list=NOUNS): return lambda tok: tok.lower_ in noun_list def removePOS(pos_list): return lambda tok: tok.pos_ not in pos_list def removeWords(words, keep=None): if hasattr(keep, '__iter__'): for k in keep: try: words.remove(k) except ValueError: pass return lambda tok: tok.lower_ not in words def keepENT(ent_list): return lambda tok: tok.ent_type_ in ent_list def removeENT(ent_list): return lambda tok: tok.ent_type_ not in ent_list def remove_words_containing_Numbers(): return lambda tok: not bool(re.search('\d', tok.lower_)) def remove_words_containing_topLVL(): return lambda tok: not bool(re.search(REGEX_TOPLVL, tok.lower_)) def remove_words_containing_specialCharacters(): return lambda tok: not bool(re.search(REGEX_SPECIALCHAR, tok.lower_)) def remove_long_words(): return lambda tok: not len(tok.lower_) < 2 def remove_short_words(): return lambda tok: not len(tok.lower_) > 35 def remove_first_names(): return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN] ############# strings def remove_addresses(string): pass # todo def lemmatizeWord(word,lemma_dict=LEMMAS,n=3): for i in range(n): try: word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower() except: print(word) return word def getFirstSynonym(word, thesaurus=THESAURUS): if not isinstance(word, str): return str(word) word = word.lower() if word in thesaurus.keys(): return thesaurus[word] else: return str(word) ########################## Spellchecking ########################################## # http://norvig.com/spell-correct.html # http://wortschatz.uni-leipzig.de/en/download import re def words(text): return re.findall(r'\w+', text.lower()) def P(word, N=sum(WORDS.values())): "Probability of `word`." return WORDS[word] / N def correction(word): "Most probable spelling correction for word." return max(candidates(word), key=P) def candidates(word): "Generate possible spelling corrections for word." return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word]) def known(words): "The subset of `words` that appear in the dictionary of WORDS." return set(w for w in words if w in WORDS) def edits1(word): "All edits that are one edit away from `word`." letters = 'abcdefghijklmnopqrstuvwxyz' splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [L + R[1:] for L, R in splits if R] transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] replaces = [L + c + R[1:] for L, R in splits if R for c in letters] inserts = [L + c + R for L, R in splits for c in letters] return set(deletes + transposes + replaces + inserts) def edits2(word): "All edits that are two edits away from `word`." return (e2 for e1 in edits1(word) for e2 in edits1(e1)) def autocorrectWord(word): try: return correction(word) except: return word ############# stringcleaning @deprecated def stringcleaning(stringstream): for string in stringstream: string = string.lower() # fixUnicode string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC') # remove_words_containing_topLVL string = " ".join([w.lower() for w in string.split() if not re.search(REGEX_TOPLVL, w)]) # replaceRockDots string = re.sub(r'[ß]', "ss", string) string = re.sub(r'[ö]', "oe", string) string = re.sub(r'[ü]', "ue", string) string = re.sub(r'[ä]', "ae", string) # seperate_words_on_regex: string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string)) # cut_after word = "gruss" #idee addressen enfernen --> postal.parser string = string.rpartition(word)[0] if word in string else string # lemmatize string = " ".join([lemmatizeWord(word) for word in string.split()]) # synonyme normalisieren #idee vor oder nach lemmatize? string = " ".join([getFirstSynonym(word) for word in string.split()]) # autocorrect string = " ".join([autocorrectWord(word) for word in string.split()]) yield string def filterTokens(tokens, funclist): # in:tokenlist, funclist # out: tokenlist for f in funclist: tokens = list(filter(f, tokens)) return tokens def processContentstream2(textstream, parser, token_filterlist=None): #pre parse textstream = preparse(textstream) pipe = parser.pipe(textstream) for doc in pipe: tokens = [tok for tok in doc] # in parse if token_filterlist is not None: tokens = filterTokens(tokens, token_filterlist) # post parse tokens = [postparse(tok) for tok in tokens] #todo informationsverlust von pos,tag etc.! yield " ".join(tokens) def preparse(stringstream): for string in stringstream: # cut_after # todo addressen enfernen --> postal.parser idee zu metadaten hinzufügen words = ["gruss", "grusse","gruesse","gruessen","grusses"] for gr in words: if gr in string: string = string.rpartition(gr)[0] break yield string def postparse(toktext): """ :param toktext: spacy.token :return: string """ toktext = toktext.lower_ # remove_words_containing_topLVL toktext = toktext if not re.search(REGEX_TOPLVL, toktext) else "" # lemmatize toktext = lemmatizeWord(toktext) # synonyme normalisieren toktext = getFirstSynonym(toktext) # autocorrect toktext = autocorrectWord(toktext) return toktext def corpus2Text(corpus): for doc in corpus: yield doc.text def corpus2Meta(corpus): for doc in corpus: yield doc.metadata @deprecated def processContentstream(textstream, parser, token_filterlist=None): """ :param textstream: string-gen :param funclist: [func] :param parser: spacy-parser :return: string-gen """ # pre_parse textstream = stringcleaning(textstream) pipe = parser.pipe(textstream) tokens = [] for doc in pipe: tokens = [tok for tok in doc] # in_parse if token_filterlist is not None: tokens = filterTokens(tokens, token_filterlist) yield " ".join([tok.lower_ for tok in tokens]) # yield " ".join(list(set([tok.lower_ for tok in tokens]))) def processDictstream(dictstream, funcdict, parser): """ :param dictstream: dict-gen :param funcdict: clean_in_meta = { "Solution":funclist, ... } :param parser: spacy-parser :return: dict-gen """ for dic in dictstream: result = {} for key, value in dic.items(): if key in funcdict: doc = parser(value) tokens = [tok for tok in doc] funclist = funcdict[key] tokens = filterTokens(tokens, funclist) result[key] = " ".join([tok.lower_ for tok in tokens]) else: result[key] = value yield result ################################################################################################## # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &" path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file") path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file") path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file") path2nouns_list = FILEPATH + config.get("nouns","pickle_file") path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file") path2DEstopwordlist = FILEPATH + config.get("de_stopwords", "pickle_file") path2ENstopwordlist = FILEPATH + config.get("en_stopwords", "pickle_file") corpus_de_path = FILEPATH + config.get("de_corpus", "path") corpus_en_path = FILEPATH + config.get("en_corpus", "path") def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10): logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now())) cleanCorpus_name = lang + "_clean_ticket" preCorpus_name = lang + "_pre_ticket" logprint("Load {0}_raw".format(lang)) #load raw corpus and create new one clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path) corpus = textacy.Corpus(parser) ## process and add files to textacy-corpi, corpus.add_texts( processContentstream2(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser), processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser) ) # leere docs aus corpi kicken corpus.remove(lambda doc: len(doc) == 0) for i in range(printrandom): printRandomDoc(corpus) #save corpus save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name) #save corpus as labled, plain text plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt" textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath ) return corpus def labledCorpiLines(corpus): for doc in corpus: # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi yield "[" + doc.metadata["categoryName"] + "] " + doc.text def main(): start = time.time() THESAURUS = load_obj(path2thesaurus_dict) WORDS = load_obj(path2wordsdict) LEMMAS = load_obj(path2lemmadict) DE_STOP_WORDS = load_obj(path2DEstopwordlist) EN_STOP_WORDS = load_obj(path2ENstopwordlist) NOUNS = load_obj(path2nouns_list) VORNAMEN = load_obj(path2firstnameslist) custom_words = config.get("preprocessing","custom_words").split(",") filter_tokens = [ # removeENT(["PERSON"]), keepNouns(NOUNS), remove_words_containing_Numbers(), removePOS(["PUNCT", "SPACE", "NUM"]), removeWords(DE_STOP_WORDS + custom_words), #removeWords(DE_STOP_WORDS), remove_long_words(), remove_short_words(), remove_first_names() ] clean_in_meta = { "Solution": [removePOS(["SPACE"])], "Subject": [removePOS(["SPACE", "PUNCT"])], "categoryName": [removePOS(["SPACE", "PUNCT"])] } corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de",printrandom=5) #from topicModeling import jgibbsLLDA #jgibbsLLDA(corpus) #preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" ) end = time.time() logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) if __name__ == "__main__": main() """ pipe=[ ##String fixUnicode(), replaceHardS(), resolveAbbrivations(), remove_words_containing_topLVL(), replaceSpecialChars(" "), (mit Leerzeichen erstzen, dadruch werden Terme wie 8203;verfügung getrennt remove_words_containing_Numbers(), ##spacyParse removeENT("PERSON"), keepPOS(["NOUN"]), #ODER lemmatize(), removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")), # evtl. spellCorrection(), keepUniqeTokens(), ] """ """ filter_tokens=[ #removeENT(["PERSON"]), #idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser #idee rechtschreibkorrektur --> PyEnchant #idee thesaurus --> WordNet, eigener remove_words_containing_Numbers(), removePOS(["PUNCT","SPACE","NUM"]), removeWords(de_stop_words+custom_words), remove_long_words(), remove_short_words(), remove_first_names(), keepPOS(["NOUN"]), ] """