# -*- coding: utf-8 -*-

from datetime import datetime

print(datetime.now())

#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

path_csv_split = path2csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])

import time
import enchant

start = time.time()

import logging
import csv
import functools
import os.path
import re
import subprocess
import xml.etree.ElementTree as ET
import sys

import spacy
import textacy
from scipy import *
from textacy import Vectorizer

import warnings
import configparser as ConfigParser
import hunspell
from postal.parser import parse_address

csv.field_size_limit(sys.maxsize)

#ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testo.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout.log &"

# todo configuration file
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""

logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"

# configure logging
logging.basicConfig(filename=logfile, level=logging.INFO)
#logging.basicConfig(filename=config.get("filepath", "logfile"), level=logging.INFO)

thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
#thesauruspath = config.get("filepath", "thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))

from langdetect import detect

DE_PARSER = spacy.load("de")
# todo language detection; idea: separate corpora for different languages
#EN_PARSER = spacy.load("en")
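# The langdetect import above is only a TODO so far. A minimal sketch of how texts could be
# routed to per-language parsers (assumptions: an EN_PARSER would be loaded like DE_PARSER, and
# langdetect.detect returns ISO codes such as "de"/"en"); not wired into the pipeline yet.
def pick_parser(text, default=DE_PARSER):
    parsers = {"de": DE_PARSER}  # e.g. parsers["en"] = EN_PARSER once it is loaded
    try:
        return parsers.get(detect(text), default)
    except Exception:
        # langdetect raises on empty or undetectable input
        return default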
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe",
                                         (re.sub(r'[ü]', "ue",
                                                 (re.sub(r'[ä]', "ae", string.lower())))))))

"""
de_stop_words = set(
    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
)

LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))

VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
"""

de_stop_words = list(map(textacy.preprocess.normalize_whitespace,
                         textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")))

#en_stop_words = set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS))

LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))

VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,
                    textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))

NOUNS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt"))
NOUNS = NOUNS + list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))

print(de_stop_words[10:30])
print(LEMMAS[10:30])
print(VORNAMEN[10:30])
print(NOUNS[10:30])

mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)


def printlog(string, level="INFO"):
    """log and print"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)

printlog(str(datetime.now()))
printlog("Load functions")


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)


def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")


def printRandomDoc(textacyCorpus):
    import random
    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))
    print()


def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value
    for i, lst in enumerate(stream):
        if i == 0:
            # look for the desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value
    metaindices = []
    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            # could surely be done more efficiently, but it only happens once
            for j, col in enumerate(lst):
                for key in metalist:
                    if key == col:
                        metaindices.append(j)
            metadata_temp = dict(zip(metalist, metaindices))  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata
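# Usage sketch for the two CSV generators above (assumption: the M42 export has a header row
# with columns such as "Subject";"Description";"categoryName";"Solution"):
#
#   texts = csv_to_contentStream(path2csv, "Description")   # yields one ticket text per row
#   metas = csv_to_metaStream(path2csv, ["Subject", "categoryName", "Solution"])
#   # next(metas) -> {'Subject': '...', 'categoryName': '...', 'Solution': '...'}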
############# filter tokens

def keepPOS(pos_list):
    return lambda tok: tok.pos_ in pos_list


def keepNouns(noun_list=NOUNS):
    return lambda tok: tok.lower_ in noun_list


def removePOS(pos_list):
    return lambda tok: tok.pos_ not in pos_list


def removeWords(words, keep=None):
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                words.remove(k)
            except ValueError:
                pass
    return lambda tok: tok.lower_ not in words


def keepENT(ent_list):
    return lambda tok: tok.ent_type_ in ent_list


def removeENT(ent_list):
    return lambda tok: tok.ent_type_ not in ent_list


def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search(r'\d', tok.lower_))


"""
def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))

def remove_words_containing_specialCharacters():
    return lambda tok: not bool(re.search(regex_specialChars, tok.lower_))
"""


def remove_long_words():
    # drop overly long tokens (> 35 characters)
    return lambda tok: not len(tok.lower_) > 35


def remove_short_words():
    # drop single-character tokens
    return lambda tok: not len(tok.lower_) < 2


def remove_first_names():
    return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]


############# strings

def remove_addresses(string):
    pass  # todo
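# A minimal sketch for the remove_addresses todo, using the postal.parser import from the top
# of the script (assumption: a line that libpostal labels with a house_number plus road, or a
# postcode, is an address line and can be dropped). Hypothetical helper, not used below.
def remove_addresses_sketch(string):
    kept_lines = []
    for line in string.split("\n"):
        labels = {label for _, label in parse_address(line)}
        if ({"house_number", "road"} <= labels) or ("postcode" in labels):
            continue  # looks like an address line -> drop it
        kept_lines.append(line)
    return "\n".join(kept_lines)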
"""
def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def cut_after(word="gruss"):
    return lambda string: string.rpartition(word)[0] if word in string else string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


def remove_words_containing_topLVL():
    return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])


def replaceSpecialChars(replace_with=" "):
    return lambda string: re.sub(regex_specialChars, replace_with, string.lower())


def replaceNumbers(replace_with="NUMBER"):
    return lambda string: textacy.preprocess.replace_numbers(string.lower(), replace_with=replace_with)


def replacePhonenumbers(replace_with="PHONENUMBER"):
    return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)


def replaceSharpS(replace_with="ss"):
    return lambda string: re.sub(r'[ß]', replace_with, string.lower())


def fixUnicode():
    return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
"""

"""
def lemmatizeWord(word, filepath=LEMMAS):
    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
        if word.lower() == line.split()[1].strip().lower():
            return line.split()[0].strip().lower()
    return word.lower()  # if nothing was found


def create_lemma_dicts(lemmalist=LEMMAS):
    w_dict = {}
    lem_dict = {}
    for i, line in enumerate(lemmalist):
        try:
            lem_word_pair = line.split()
            if len(lem_word_pair) != 2:
                print(line)
            lemma = lem_word_pair[0].strip().lower()
            word = lem_word_pair[1].strip().lower()
        except:
            print(line)
        if lemma not in lem_dict:
            lem_dict[lemma] = i
        if word not in w_dict:
            w_dict[word] = lem_dict[lemma]
    l_dict = {v: k for k, v in lem_dict.items()}  # switch keys/values
    return l_dict, w_dict

lemma_dict, word_dict = create_lemma_dicts()


def lemmatizeWord(word, l_dict=lemma_dict, w_dict=word_dict):
    # apply the lookup several times
    for i in range(3):
        try:
            word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
        except:
            print(word)
    return word


def lemmatize():
    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])


def lemmatize():
    return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()])


DE_SPELLCHECKER = enchant.Dict("de_DE")
EN_SPELLCHECKER = enchant.Dict("en_US")


def autocorrectWord(word, spellchecker=DE_SPELLCHECKER):
    try:
        return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
    except:
        return word


def autocorrect():
    return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()])
"""


def create_lemma_dicts(lemmalist=LEMMAS):
    # lemmas.txt is expected to hold whitespace-separated "lemma wordform" pairs, one per line
    w_dict = {}
    lem_dict = {}
    for i, line in enumerate(lemmalist):
        try:
            lem_word_pair = line.split()
            if len(lem_word_pair) != 2:
                print(line)
            lemma = lem_word_pair[0].strip().lower()
            word = lem_word_pair[1].strip().lower()
        except:
            print(line)
        if lemma not in lem_dict:
            lem_dict[lemma] = i
        if word not in w_dict:
            w_dict[word] = lem_dict[lemma]
    l_dict = {v: k for k, v in lem_dict.items()}  # switch keys/values
    return l_dict, w_dict

lemma_dict, word_dict = create_lemma_dicts()


def lemmatizeWord(word, l_dict=lemma_dict, w_dict=word_dict, n=3):
    # apply the lookup several times (handles chained lemma entries)
    for i in range(n):
        try:
            word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
        except:
            print(word)
    return word


DE_SPELLCHECKER = enchant.Dict("de_DE")
EN_SPELLCHECKER = enchant.Dict("en_US")


def autocorrectWord(word, spellchecker=DE_SPELLCHECKER):
    try:
        return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
    except:
        return word
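# Usage sketch (assumptions: lemmas.txt holds one "lemma wordform" pair per line and the
# enchant de_DE dictionary is installed):
#
#   lemmatizeWord("wordform")  # -> its lemma after up to n lookups, else the lowercased input
#   autocorrectWord("Fehelr")  # -> first enchant suggestion for an unknown word, else unchanged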
############# stringcleaning

def stringcleaning(stringstream):
    regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]'
    regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

    for string in stringstream:
        string = string.lower()

        # fixUnicode
        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')

        # remove_words_containing_topLVL
        string = " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)

        # seperate_words_on_regex
        string = " ".join(re.compile(regex_specialChars).split(string))

        # cut_after
        word = "gruss"
        string = string.rpartition(word)[0] if word in string else string

        # lemmatize
        string = " ".join([lemmatizeWord(word) for word in string.split()])

        # autocorrect
        #string = " ".join([autocorrectWord(word) for word in string.split()])

        yield string


def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
    """
    :param textstream: string-gen
    :param token_filterlist: [func]
    :param parser: spacy-parser
    :return: string-gen
    """
    """
    filter_tokens=[
        #removeENT(["PERSON"]),
        #idea: remove addresses; so far done with cut_after("gruss") --> postal.parser
        #idea: spelling correction --> PyEnchant
        #idea: thesaurus --> WordNet, or a custom one

        remove_words_containing_Numbers(),

        removePOS(["PUNCT","SPACE","NUM"]),

        removeWords(de_stop_words+custom_words),

        remove_long_words(),
        remove_short_words(),
        remove_first_names(),

        keepPOS(["NOUN"]),
    ]
    """

    # pre-parse: clean the raw strings
    textstream = stringcleaning(textstream)

    pipe = parser.pipe(textstream)
    tokens = []
    for doc in pipe:
        tokens = [tok for tok in doc]
        #print(" ".join([tok.lower_ for tok in tokens]))

        # in-parse: filter the spacy tokens
        if token_filterlist is not None:
            tokens = filterTokens(tokens, token_filterlist)

        #yield " ".join([tok.lower_ for tok in tokens])
        yield " ".join(list(set([tok.lower_ for tok in tokens])))


def processDictstream(dictstream, funcdict, parser=DE_PARSER):
    """
    :param dictstream: dict-gen
    :param funcdict:
            clean_in_meta = {
                "Solution": funclist,
                ...
            }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]
                tokens = filterTokens(tokens, funclist)
                result[key] = " ".join([tok.lower_ for tok in tokens])
            else:
                result[key] = value
        yield result


def filterTokens(tokens, funclist):
    # in: token list, function list
    # out: token list
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens
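# Usage sketch: filterTokens chains the predicate factories defined further up, e.g.
#
#   doc = DE_PARSER("sehr geehrte damen und herren, mein passwort 1234 geht nicht")
#   filterTokens([tok for tok in doc], [removePOS(["PUNCT"]), remove_words_containing_Numbers()])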
custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
                "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb",
                "helfen", "versuchen", "unbestimmt", "woche", "tadelos", "klappen", "mittlerweile",
                "bekommen", "erreichbar", "gruss", "auffahren", "vorgang", "hinweis", "institut",
                "universitaet", "name", "id", "erfolg", "mail", "folge",
                "nummer", "team", "fakultaet", "email", "absender", "tu", "versenden", "vorname", "message",
                "service", "strasse", "prozess", "portal", "raum", "personal", "moeglichkeit", "fremd", "wende",
                "rueckfrage", "stehen", "verfuegung", "funktionieren", "kollege", "pruefen"]

filter_tokens = [
    #removeENT(["PERSON"]),
    #idea: remove addresses; so far done with cut_after("gruss") --> postal.parser
    #idea: spelling correction --> PyEnchant
    #idea: thesaurus --> WordNet

    keepNouns(),

    remove_words_containing_Numbers(),

    removePOS(["PUNCT", "SPACE", "NUM"]),

    removeWords(de_stop_words + custom_words),

    remove_long_words(),
    remove_short_words(),
    remove_first_names()

    #keepPOS(["NOUN"]),
]

metaliste = [
    "Subject",
    "categoryName",
    "Solution"
]

clean_in_meta = {
    "Solution": [removePOS(["SPACE"])],
    "Subject": [removePOS(["SPACE", "PUNCT"])],
    "categoryName": [removePOS(["SPACE", "PUNCT"])]
}

"""
pipe=[
    ##String
    fixUnicode(),
    replaceHardS(),
    resolveAbbrivations(),

    remove_words_containing_topLVL(),

    replaceSpecialChars(" "),  (replace with spaces, so that terms like 8203;verfügung get separated)

    remove_words_containing_Numbers(),

    ##spacyParse
    removeENT("PERSON"),
    keepPOS(["NOUN"]),

    #OR
    lemmatize(),
    removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),

    # optionally
    spellCorrection(),
    keepUniqeTokens(),
]
"""

de_corpus = textacy.Corpus(DE_PARSER)
#en_corpus = textacy.Corpus(EN_PARSER)

## add texts to textacy-corpus
printlog("add texts to textacy-corpus")
de_corpus.add_texts(
    processContentstream(csv_to_contentStream(path2csv, "Description"), token_filterlist=filter_tokens),
    processDictstream(csv_to_metaStream(path2csv, metaliste), clean_in_meta)
)

for i in range(10):
    printRandomDoc(de_corpus)

end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))

"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_corpus"
corpus_compression = 'gzip'
de_corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
de_corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
"""

# build dictionary of ticket categories
labelist = []
for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
    labelist.append(texdoc.metadata["categoryName"])

LABELDICT = {k: v for v, k in enumerate(labelist)}
printlog(str("LABELDICT: {0}".format(LABELDICT)))


def topicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False,
                  corpus=de_corpus):
    printlog("############################################ Topic Modeling {0} #############################################".format(topicModel))
    print("\n\n")
    printlog(str("ngrams: {0}".format(ngrams)))
    printlog(str("min_df: {0}".format(min_df)))
    printlog(str("max_df: {0}".format(max_df)))
    printlog(str("n_topics: {0}".format(n_topics)))
    printlog(str("named_entities: {0}".format(named_entities)))

    start = time.time()

    top_topic_words = 10
    top_document_labels_per_topic = 5

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    ########################################

    #printlog("vectorize corpus...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.id_to_term

    #printlog("terms_list: {0}".format(list(terms_list)))
    #printlog("doc_term_matrix: {0}".format(doc_term_matrix))

    ##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

    # Initialize and train a topic model
    #printlog("Initialize and train a topic model..")
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    # Transform the corpus and interpret our model:
    #printlog("Transform the corpus and interpret our model..")
    doc_topic_matrix = model.transform(doc_term_matrix)

    print()

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

    print()

    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        printlog(topic_idx)
        for j in top_docs:
            printlog(corpus[j].metadata['categoryName'])

    #####################################################################################################################
    print()
    print()

    end = time.time()
    printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))


#no_below = 20
#no_above = 0.5

#n_topics = len(LABELDICT)  # len(set(ticketcorpus[0].metadata.keys()))+1  # +1 for a default topic
topicModeling(ngrams=1, min_df=1, max_df=1.0, topicModel='lda', n_topics=len(LABELDICT), corpus=de_corpus)
topicModeling(ngrams=1, min_df=0.1, max_df=0.6, topicModel='lda', n_topics=len(LABELDICT), corpus=de_corpus)
topicModeling(ngrams=(1, 2), min_df=1, max_df=1.0, topicModel='lda', n_topics=len(LABELDICT), corpus=de_corpus)
topicModeling(ngrams=(1, 2), min_df=0.1, max_df=0.6, topicModel='lda', n_topics=len(LABELDICT), corpus=de_corpus)
topicModeling(ngrams=(1, 2), min_df=0.2, max_df=0.8, topicModel='lda', n_topics=20, corpus=de_corpus)


"""
##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################

print("\n\n")
start = time.time()

top_topic_words = 10  # must be defined here; the variable above is local to topicModeling()
n_topics = len(LABELDICT)  # len(set(ticketcorpus[0].metadata.keys()))+1  # +1 for a default topic

# build dictionary of ticket categories
labelist = []
for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
    labelist.append(texdoc.metadata["categoryName"])

LABELDICT = {k: v for v, k in enumerate(labelist)}
print(LABELDICT)


def label2ID(label, labeldict=LABELDICT):
    return labeldict.get(label, len(labeldict))


def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate "[topic1, topic2, ...] tok1 tok2 tok3" lines out of the corpus
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text


jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

# create the input file
textacy.fileio.write_file_lines(generate_labled_lines(de_corpus), filepath=LLDA_filepath)

# wait for the file to exist
while not os.path.exists(LLDA_filepath):
    time.sleep(1)

print("\n\n")
printlog("start LLDA:")

# run JGibbsLLDA
FNULL = open(os.devnull, 'w')  # suppress output
subprocess.call(["java", "-cp",
                 "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
                 "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile", "tickets.gz",
                 "-twords", str(top_topic_words), "-ntopics", str(n_topics)], stdout=FNULL)

# NOTE: the output files are hidden; they can be found in models/

# twords
subprocess.call(["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
#####################################################################################################################
print()
print()

end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
"""
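# Example of the labeled-LDA input line format that generate_labled_lines (in the block above)
# would produce -- one document per line, the numeric label ID in brackets, then the tokens
# (token values are illustrative):
#
#   [3] passwort account zugang freischalten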