# -*- coding: utf-8 -*- from datetime import datetime print(datetime.now()) path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv" path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv" #path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv" path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv" path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv" #idee roh-corpus (nur whitespace weg) speichern -> pregeprocesster corpus -> damit arbeiten path_csv_split = path2de_csv.split("/") print(path_csv_split[len(path_csv_split) - 1]) path_csv_split = path2en_csv.split("/") print(path_csv_split[len(path_csv_split) - 1]) import time start = time.time() import logging from nltk.corpus import stopwords import csv import functools import re import xml.etree.ElementTree as ET import spacy import textacy from scipy import * import sys csv.field_size_limit(sys.maxsize) # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &" # todo configuration file ? """ config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini" config = ConfigParser.ConfigParser() with open(config_ini) as f: config.read_file(f) """ # config logging logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log" logging.basicConfig(filename=logfile, level=logging.INFO) # logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO) # THESAURUS # thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv" # thesauruspath = config.get("filepath","thesauruspath") # THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";")) lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml" synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml" # SPELLCHECKING path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt' DE_PARSER = spacy.load("de") EN_PARSER = spacy.load("en") """ de_stop_words= set( list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + list(textacy.fileio.read_file_lines("stopwords-de.txt")) ) LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt")) VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt")) """ de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines( "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german'))) en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(stopwords.words('english')))) LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt")) VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines( "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt"))) NOUNS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt")) NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS)) """ print(de_stop_words[10:30]) print(LEMMAS[10:30]) print(VORNAMEN[10:30]) print(NOUNS[10:30]) """ mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE) specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE) hardSFinder = re.compile(r'[ß]', re.IGNORECASE) import pickle def save_obj(obj, path): with open(path + '.pkl', 'wb') as f: pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) def load_obj(path ): with open(path + '.pkl', 'rb') as f: return pickle.load(f) def printlog(string, level="INFO"): """log and prints""" print(string) if level == "INFO": logging.info(string) elif level == "DEBUG": logging.debug(string) elif level == "WARNING": logging.warning(string) printlog("Load functions") def compose(*functions): def compose2(f, g): return lambda x: f(g(x)) return functools.reduce(compose2, functions, lambda x: x) def get_calling_function(): """finds the calling function in many decent cases. https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name """ fr = sys._getframe(1) # inspect.stack()[1][0] co = fr.f_code for get in ( lambda: fr.f_globals[co.co_name], lambda: getattr(fr.f_locals['self'], co.co_name), lambda: getattr(fr.f_locals['cls'], co.co_name), lambda: fr.f_back.f_locals[co.co_name], # nested lambda: fr.f_back.f_locals['func'], # decorators lambda: fr.f_back.f_locals['meth'], lambda: fr.f_back.f_locals['f'], ): try: func = get() except (KeyError, AttributeError): pass else: if func.__code__ == co: return func raise AttributeError("func not found") def printRandomDoc(textacyCorpus): import random print() printlog("len(textacyCorpus) = %i" % len(textacyCorpus)) randIndex = int((len(textacyCorpus) - 1) * random.random()) printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) print() def csv_to_contentStream(path2csv: str, content_collumn_name: str): """ :param path2csv: string :param content_collumn_name: string :return: string-generator """ stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8') content_collumn = 0 # standardvalue for i, lst in enumerate(stream): if i == 0: # look for desired column for j, col in enumerate(lst): if col == content_collumn_name: content_collumn = j else: yield lst[content_collumn] def csv_to_metaStream(path2csv: str, metalist: [str]): """ :param path2csv: string :param metalist: list of strings :return: dict-generator """ stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8') content_collumn = 0 # standardvalue metaindices = [] metadata_temp = {} for i, lst in enumerate(stream): if i == 0: for j, col in enumerate(lst): # geht bestimmt effizienter... egal, weil passiert nur einmal for key in metalist: if key == col: metaindices.append(j) metadata_temp = dict( zip(metalist, metaindices)) # zB {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10} else: metadata = metadata_temp.copy() for key, value in metadata.items(): metadata[key] = lst[value] yield metadata def save_corpus(corpus, corpus_path, corpus_name, parser=DE_PARSER): # save stringstore stringstore_path = corpus_path + corpus_name + '_strings.json' with open(stringstore_path, "w") as file: parser.vocab.strings.dump(file) # save content contentpath = corpus_path + corpus_name + "_content.bin" textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath) # save meta metapath = corpus_path + corpus_name + "_meta.json" textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath) ############# filter tokens def keepPOS(pos_list): return lambda tok: tok.pos_ in pos_list def keepNouns(noun_list=NOUNS): return lambda tok: tok.lower_ in noun_list def removePOS(pos_list): return lambda tok: tok.pos_ not in pos_list def removeWords(words, keep=None): if hasattr(keep, '__iter__'): for k in keep: try: words.remove(k) except ValueError: pass return lambda tok: tok.lower_ not in words def keepENT(ent_list): return lambda tok: tok.ent_type_ in ent_list def removeENT(ent_list): return lambda tok: tok.ent_type_ not in ent_list def remove_words_containing_Numbers(): return lambda tok: not bool(re.search('\d', tok.lower_)) """ def remove_words_containing_topLVL(): return lambda tok: not bool(re.search(regex_topLvl, tok.lower_)) def remove_words_containing_specialCharacters(): return lambda tok: not bool(re.search(regex_specialChars, tok.lower_)) """ def remove_long_words(): return lambda tok: not len(tok.lower_) < 2 def remove_short_words(): return lambda tok: not len(tok.lower_) > 35 def remove_first_names(): return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN] ############# strings def replaceRockDots(): return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower()))))))) def remove_addresses(string): pass # todo """ def stringcleaning(stringstream, funclist): for string in stringstream: for f in funclist: string = f(string) yield string def cut_after(word="gruss"): return lambda string: string.rpartition(word)[0] if word in string else string def seperate_words_on_regex(regex=regex_specialChars): return lambda string: " ".join(re.compile(regex).split(string)) def remove_words_containing_topLVL(): return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ]) def replaceSpecialChars(replace_with=" "): return lambda string: re.sub(regex_specialChars, replace_with, string.lower()) def replaceNumbers(replace_with="NUMBER"): return lambda string : textacy.preprocess.replace_numbers(string.lower(), replace_with=replace_with) def replacePhonenumbers(replace_with="PHONENUMBER"): return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with) def replaceSharpS(replace_with="ss"): return lambda string: re.sub(r'[ß]',replace_with,string.lower()) def fixUnicode(): return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC') """ """ def lemmatizeWord(word,filepath=LEMMAS): for line in list(textacy.fileio.read_file_lines(filepath=filepath)): if word.lower() == line.split()[1].strip().lower(): return line.split()[0].strip().lower() return word.lower() # falls nix gefunden wurde def create_lemma_dicts(lemmalist=LEMMAS): w_dict = {} lem_dict = {} for i, line in enumerate(lemmalist): try: lem_word_pair = line.split() if len(lem_word_pair) != 2: print(line) lemma = lem_word_pair[0].strip().lower() word = lem_word_pair[1].strip().lower() except: print(line) if lemma not in lem_dict: lem_dict[lemma] = i if word not in w_dict: w_dict[word] = lem_dict[lemma] l_dict = {v: k for k, v in lem_dict.items()} # switch key/values return l_dict,w_dict lemma_dict,word_dict = create_lemma_dicts() def lemmatizeWord(word,l_dict=lemma_dict,w_dict=word_dict): #mehrmals machen for i in range(3): try: word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower() except: print(word) return word def lemmatize(): return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc]) def lemmatize(): return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()]) DE_SPELLCHECKER = enchant.Dict("de_DE") EN_SPELLCHECKER = enchant.Dict("en_US") def autocorrectWord(word,spellchecker=DE_SPELLCHECKER): try: return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word except: return word def autocorrect(): return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()]) """ def create_lemma_dicts(lemmalist=LEMMAS): w_dict = {} lem_dict = {} for i, line in enumerate(lemmalist): try: lem_word_pair = line.split() if len(lem_word_pair) != 2: print(line) lemma = lem_word_pair[0].strip().lower() word = lem_word_pair[1].strip().lower() except: print(line) if lemma not in lem_dict: lem_dict[lemma] = i if word not in w_dict: w_dict[word] = lem_dict[lemma] l_dict = {v: k for k, v in lem_dict.items()} # switch key/values return l_dict, w_dict lemma_dict, word_dict = create_lemma_dicts() def lemmatizeWord(word, l_dict=lemma_dict, w_dict=word_dict, n=3): # mehrmals machen for i in range(n): try: word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower() except: print(word) return word def build_thesaurus(path2lexicalentries, path2synsets): lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8")) syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8")) lexroot = lextree.getroot() synroot = syntree.getroot() thesaurus = [] for r in synroot: for element in r: if element.tag == "Synset": sysnet = [] attrib = element.attrib id = attrib["id"] for ro in lexroot: for elem in ro: if elem.tag == "LexicalEntry": subs_dicts = [subentry.attrib for subentry in elem] # : [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}] dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict if "synset" in dic.keys(): if dic["synset"] == id: string = (dic["writtenForm"]) # replaceRockDots string = re.sub(r'[ß]', "ss", string) string = re.sub(r'[ö]', "oe", string) string = re.sub(r'[ü]', "ue", string) string = re.sub(r'[ä]', "ae", string) # alle punkte raus string = re.sub(r'[.]', "", string) # alles in klammern raus string = re.sub(r"\((.*)\)", " ", string) # längeres leerzeichen normalisieren string = textacy.preprocess.normalize_whitespace(string) sysnet.append(string.lower().strip()) # nach anzhal der wörter in den strings sortieren sysnet.sort(key=lambda x: len(x.split())) if len(sysnet) != 0: # todo warum sind manche leer? thesaurus.append(sysnet) return thesaurus printlog("Build Thesaurus") THESAURUS = [] THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets) def getFirstSynonym(word, thesaurus=THESAURUS): if not isinstance(word, str): return str(word) word = word.lower() # durch den thesaurrus iterieren for syn_block in thesaurus: # syn_block ist eine liste mit Synonymen for syn in syn_block: syn = syn.lower() if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist todo phrasen auch normalisieren if word == syn: return syn_block[0] return str(word) # zur Not das ursrpüngliche Wort zurückgeben ########################## Spellchecking ########################################## # http://norvig.com/spell-correct.html # http://wortschatz.uni-leipzig.de/en/download import re from collections import Counter def words(text): return re.findall(r'\w+', text.lower()) printlog("Build Wordlist for Spellchecking") WORDS = {} WORDS = Counter(words(open(path2words).read())) def P(word, N=sum(WORDS.values())): "Probability of `word`." return WORDS[word] / N def correction(word): "Most probable spelling correction for word." return max(candidates(word), key=P) def candidates(word): "Generate possible spelling corrections for word." return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word]) def known(words): "The subset of `words` that appear in the dictionary of WORDS." return set(w for w in words if w in WORDS) def edits1(word): "All edits that are one edit away from `word`." letters = 'abcdefghijklmnopqrstuvwxyz' splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [L + R[1:] for L, R in splits if R] transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] replaces = [L + c + R[1:] for L, R in splits if R for c in letters] inserts = [L + c + R for L, R in splits for c in letters] return set(deletes + transposes + replaces + inserts) def edits2(word): "All edits that are two edits away from `word`." return (e2 for e1 in edits1(word) for e2 in edits1(e1)) """ DE_SPELLCHECKER = enchant.Dict("de_DE") EN_SPELLCHECKER = enchant.Dict("en_US") def autocorrectWord(word, spellchecker=DE_SPELLCHECKER): try: return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word except: return word """ def autocorrectWord(word): try: return correction(word) except: return word ################################################################################################## ############# stringcleaning def stringcleaning(stringstream): regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]' regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' for string in stringstream: string = string.lower() # fixUnicode string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC') # remove_words_containing_topLVL string = " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)]) # replaceRockDots string = re.sub(r'[ß]', "ss", string) string = re.sub(r'[ö]', "oe", string) string = re.sub(r'[ü]', "ue", string) string = re.sub(r'[ä]', "ae", string) # seperate_words_on_regex: string = " ".join(re.compile(regex_specialChars).split(string)) # cut_after word = "gruss" string = string.rpartition(word)[0] if word in string else string # lemmatize string = " ".join([lemmatizeWord(word) for word in string.split()]) # synonyme normalisieren #idee vor oder nach lemmatize? string = " ".join([getFirstSynonym(word) for word in string.split()]) # autocorrect string = " ".join([autocorrectWord(word) for word in string.split()]) yield string def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER): """ :param textstream: string-gen :param funclist: [func] :param parser: spacy-parser :return: string-gen """ """ filter_tokens=[ #removeENT(["PERSON"]), #idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser #idee rechtschreibkorrektur --> PyEnchant #idee thesaurus --> WordNet, eigener remove_words_containing_Numbers(), removePOS(["PUNCT","SPACE","NUM"]), removeWords(de_stop_words+custom_words), remove_long_words(), remove_short_words(), remove_first_names(), keepPOS(["NOUN"]), ] """ # pre_parse textstream = stringcleaning(textstream) pipe = parser.pipe(textstream) tokens = [] for doc in pipe: tokens = [tok for tok in doc] # in_parse if token_filterlist is not None: tokens = filterTokens(tokens, token_filterlist) yield " ".join([tok.lower_ for tok in tokens]) # yield " ".join(list(set([tok.lower_ for tok in tokens]))) def processDictstream(dictstream, funcdict, parser=DE_PARSER): """ :param dictstream: dict-gen :param funcdict: clean_in_meta = { "Solution":funclist, ... } :param parser: spacy-parser :return: dict-gen """ for dic in dictstream: result = {} for key, value in dic.items(): if key in funcdict: doc = parser(value) tokens = [tok for tok in doc] funclist = funcdict[key] tokens = filterTokens(tokens, funclist) result[key] = " ".join([tok.lower_ for tok in tokens]) else: result[key] = value yield result def filterTokens(tokens, funclist): # in:tokenlist, funclist # out: tokenlist for f in funclist: tokens = list(filter(f, tokens)) return tokens custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus", "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen", "versuchen", "unbestimmt", "woche", "tadelos", "klappen", "mittlerweile", "bekommen", "erreichbar", "gruss", "auffahren", "vorgang", "hinweis", "institut", "universitaet", "name", "gruss", "id", "erfolg", "mail","folge", "nummer", "team", "fakultaet", "email", "absender", "tu", "versenden", "vorname", "message", "service", "strasse", "prozess", "portal", "raum", "personal", "moeglichkeit", "fremd", "wende", "rueckfrage", "stehen", "verfuegung", "funktionieren", "kollege", "pruefen", "hoffen" ] filter_tokens = [ # removeENT(["PERSON"]), # idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser keepNouns(), remove_words_containing_Numbers(), removePOS(["PUNCT", "SPACE", "NUM"]), removeWords(de_stop_words + custom_words), remove_long_words(), remove_short_words(), remove_first_names() ] metaliste = [ "Subject", "categoryName", "Solution" ] clean_in_meta = { "Solution": [removePOS(["SPACE"])], "Subject": [removePOS(["SPACE", "PUNCT"])], "categoryName": [removePOS(["SPACE", "PUNCT"])] } """ pipe=[ ##String fixUnicode(), replaceHardS(), resolveAbbrivations(), remove_words_containing_topLVL(), replaceSpecialChars(" "), (mit Leerzeichen erstzen, dadruch werden Terme wie 8203;verfügung getrennt remove_words_containing_Numbers(), ##spacyParse removeENT("PERSON"), keepPOS(["NOUN"]), #ODER lemmatize(), removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")), # evtl. spellCorrection(), keepUniqeTokens(), ] """ de_corpus = textacy.Corpus(DE_PARSER) en_corpus = textacy.Corpus(EN_PARSER) ## add files to textacy-corpus, printlog("Add texts to textacy-corpus") de_corpus.add_texts( processContentstream(csv_to_contentStream(path2de_csv, "Description"), token_filterlist=filter_tokens), processDictstream(csv_to_metaStream(path2de_csv, metaliste), clean_in_meta) ) # leere docs aus corpus kicken de_corpus.remove(lambda doc: len(doc) == 0) for i in range(20): printRandomDoc(de_corpus) #save corpus corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/" corpus_name = "de_corpus" save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name) #todo das selbe mit en_corpus end = time.time() printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))