From 3bfbebc894aefe5d636f0c422adc51fc368e7ba5 Mon Sep 17 00:00:00 2001
From: "jannis.grundmann"
Date: Mon, 2 Oct 2017 14:31:33 +0200
Subject: [PATCH] thesaurus completed

---
 testo.py  | 303 ++++++++++++++++++++++++++++++++++++++++++++----------
 testra.py | 108 ++++++-------------
 2 files changed, 282 insertions(+), 129 deletions(-)

diff --git a/testo.py b/testo.py
index 2c4ee97..b3ff86c 100644
--- a/testo.py
+++ b/testo.py
@@ -8,7 +8,7 @@ print(datetime.now())
 
 #path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
 path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
-path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
+#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
 
 path_csv_split = path2csv.split("/")
 print(path_csv_split[len(path_csv_split)-1])
@@ -62,9 +62,17 @@ logging.basicConfig(filename=logile, level=logging.INFO)
 #logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
 
 
-thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
+#thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
 #thesauruspath = config.get("filepath","thesauruspath")
-THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
+#THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
+
+
+# THESAURUS
+lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
+synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
+
+path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
+
 
 
 from langdetect import detect
@@ -90,9 +98,9 @@ LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
 
 VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
 """
-
+from nltk.corpus import stopwords
 de_stop_words = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")))
-
+de_stop_words = de_stop_words + list(set(stopwords.words('english')))
 #en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS))
 
 LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
@@ -430,8 +438,141 @@ def lemmatizeWord(word, l_dict=lemma_dict, w_dict=word_dict, n=3):
             print(word)
             return word
 
+def build_thesaurus(path2lexicalentries, path2synsets):
+
+    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
+    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
+
+    lexroot = lextree.getroot()
+    synroot = syntree.getroot()
+
+    thesaurus = []
+
+    for r in synroot:
+        for element in r:
+
+            if element.tag == "Synset":
+                sysnet = []
+                attrib = element.attrib
+                id = attrib["id"]
+
+                for ro in lexroot:
+                    for elem in ro:
+                        if elem.tag == "LexicalEntry":
+                            subs_dicts = [subentry.attrib for subentry in elem]
+                            #: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
+
+                            dic = {k: v for x in subs_dicts for k, v in x.items()}  # merge into one dict
+                            if "synset" in dic.keys():
+                                if dic["synset"] == id:
+
+                                    string = (dic["writtenForm"])
+
+                                    # replaceRockDots
+                                    string = re.sub(r'[ß]', "ss", string)
+                                    string = re.sub(r'[ö]', "oe", string)
+                                    string = re.sub(r'[ü]', "ue", string)
+                                    string = re.sub(r'[ä]', "ae", string)
+
+                                    # strip all periods
+                                    string = re.sub(r'[.]', "", string)
+
+                                    # strip everything in parentheses
+                                    string = re.sub(r"\((.*)\)", " ", string)
+
+                                    # normalize longer runs of whitespace
+                                    string = textacy.preprocess.normalize_whitespace(string)
+
+                                    sysnet.append(string.lower().strip())
+
+                # sort the strings by their number of words
+                sysnet.sort(key=lambda x: len(x.split()))
+                if len(sysnet) != 0:
+                    # todo: why are some of these empty?
+                    thesaurus.append(sysnet)
+    return thesaurus
+
+
+
+THESAURUS = []
+#THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)  # todo: switch back on
+
+def getFirstSynonym(word, thesaurus=THESAURUS):
+    if not isinstance(word, str):
+        return str(word)
+
+    word = word.lower()
+
+    # iterate over the thesaurus
+    for syn_block in thesaurus:  # syn_block is a list of synonyms
+
+        for syn in syn_block:
+            syn = syn.lower()
+            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word; todo: normalize phrases as well
+                if word == syn:
+                    return syn_block[0]
+
+    return str(word)  # as a fallback, return the original word
+
+
+
+
+
+
+########################## Spellchecking ##########################################
+
+#http://norvig.com/spell-correct.html
+
+#http://wortschatz.uni-leipzig.de/en/download
+
+import re
+from collections import Counter
+
+
+def words(text): return re.findall(r'\w+', text.lower())
+
+
+WORDS = {}
+#WORDS = Counter(words(open(path2words).read()))  # todo: switch back on
+
+def P(word, N=sum(WORDS.values())):
+    "Probability of `word`."
+    return WORDS[word] / N
+
+def correction(word):
+    "Most probable spelling correction for word."
+    return max(candidates(word), key=P)
+
+def candidates(word):
+    "Generate possible spelling corrections for word."
+    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
+
+def known(words):
+    "The subset of `words` that appear in the dictionary of WORDS."
+    return set(w for w in words if w in WORDS)
+
+def edits1(word):
+    "All edits that are one edit away from `word`."
+    letters = 'abcdefghijklmnopqrstuvwxyz'
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [L + R[1:] for L, R in splits if R]
+    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+    inserts = [L + c + R for L, R in splits for c in letters]
+    return set(deletes + transposes + replaces + inserts)
+
+def edits2(word):
+    "All edits that are two edits away from `word`."
+    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
+"""
 DE_SPELLCHECKER = enchant.Dict("de_DE")
 EN_SPELLCHECKER = enchant.Dict("en_US")
@@ -440,6 +581,18 @@ def autocorrectWord(word, spellchecker=DE_SPELLCHECKER):
         return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
     except:
         return word
+"""
+
+def autocorrectWord(word):
+    try:
+        return correction(word)
+    except:
+        return word
+
+
+##################################################################################################
+
+
 
 
 ############# stringcleaning
@@ -475,6 +628,9 @@ def stringcleaning(stringstream):
         # lemmatize
         string = " ".join([lemmatizeWord(word) for word in string.split()])
 
+        # normalize synonyms  # idea: before or after lemmatizing?
+        #string = " ".join([getFirstSynonym(word) for word in string.split()])
+
         # autocorrect
         #string = " ".join([autocorrectWord(word) for word in string.split()])
 
@@ -534,10 +690,8 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
 
         tokens = filterTokens(tokens, token_filterlist)
 
-
-
-        #yield " ".join([tok.lower_ for tok in tokens])
-        yield " ".join(list(set([tok.lower_ for tok in tokens])))
+        yield " ".join([tok.lower_ for tok in tokens])
+        #yield " ".join(list(set([tok.lower_ for tok in tokens])))
 
 
 
@@ -602,8 +756,6 @@ custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","vora
 
 filter_tokens=[
     #removeENT(["PERSON"]),
     # idea: strip addresses  # so far via cut_after("gruss") --> postal.parser
-    # idea: spelling correction --> PyEnchant
-    # idea: thesaurus --> WordNet
 
     keepNouns(),
 
@@ -696,6 +848,11 @@ de_corpus.add_texts(
     processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
 )
 
+
+# drop empty docs from the corpus
+de_corpus.remove(lambda doc: len(doc) == 0)
+
+
 for i in range(10):
     printRandomDoc(de_corpus)
 
@@ -706,6 +863,44 @@ end = time.time()
 
 printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
 
 
+
+def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
+
+    printlog(str("ngrams: {0}".format(ngrams)))
+    printlog(str("min_df: {0}".format(min_df)))
+    printlog(str("max_df: {0}".format(max_df)))
+    printlog(str("named_entities: {0}".format(named_entities)))
+
+    #printlog("vectorize corpus...")
+    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+
+    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)
+    doc_term_matrix = vectorizer.fit_transform(terms_list)
+    id2term = vectorizer.__getattribute__("id_to_term")
+
+    for t in terms_list:
+        print(t)
+    printlog("doc_term_matrix: {0}".format(doc_term_matrix))
+    printlog("id2term: {0}".format(id2term))
+
+
+
+# todo: find a sensible tf(-idf) weighting  # idea: with a token set, tf will always be 1
+ngrams = 1
+min_df = 1
+max_df = 1.0
+weighting = 'tf'
+# weighting ='tfidf'
+named_entities = False
+
+#printvecotorization(ngrams=ngrams, min_df=min_df, max_df=max_df, weighting=weighting, named_entities=named_entities)
+
+
+
+
 """
 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/" corpus_name = "de_corpus" @@ -809,7 +1004,7 @@ def topicModeling(ngrams,min_df,max_df,topicModel = 'lda',n_topics = len(LABELDI - +""" topicModeling(ngrams = 1, min_df = 1, max_df = 1.0, @@ -851,51 +1046,55 @@ topicModeling(ngrams = (1,2), - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## + +top_topic_words = 10 + + print("\n\n") start = time.time() @@ -933,7 +1132,7 @@ LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) #create file textacy.fileio.write_file_lines(generate_labled_lines(de_corpus), filepath=LLDA_filepath) - +#todfo ticket drucken # wait for file to exist while not os.path.exists(LLDA_filepath): time.sleep(1) @@ -965,7 +1164,7 @@ end = time.time() printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start)/60)) -""" + diff --git a/testra.py b/testra.py index a611a50..0be8eac 100644 --- a/testra.py +++ b/testra.py @@ -7,6 +7,7 @@ import textacy start = time.time() +import enchant from datetime import datetime @@ -15,12 +16,8 @@ import xml.etree.ElementTree as ET print(datetime.now()) - - -nomen=[] #PARSER=spacy.load("de") -#todo: thesaurus....yay... """ def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER): @@ -67,7 +64,6 @@ def getHauptform(syn_block, word, default_return_first_Syn=False): ### extract from deWordNet.xml #https://github.com/hdaSprachtechnologie/odenet -#idee synsets bilden """ @@ -98,90 +94,46 @@ for r in root: """ -lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml" -synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml" - - -lextree = ET.parse(lexicalentries, ET.XMLParser(encoding="utf-8")) -syntree = ET.parse(synsets, ET.XMLParser(encoding="utf-8")) - -lexroot = lextree.getroot() -synroot = syntree.getroot() - - - -for r in synroot: - for element in r: - - if element.tag == "Synset": - sysnet = [] - attrib = element.attrib - id = attrib["id"] - - - for ro in lexroot: - for elem in ro: - if elem.tag == "LexicalEntry": - subs_dicts = [subentry.attrib for subentry in elem] - #: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}] - - dic = {k:v for x in subs_dicts for k,v in x.items()} # to one dict - if "synset" in dic.keys(): - if dic["synset"] == id: - - if id == "de-1004-n": - x = 0 - - string = (dic["writtenForm"]) - - # replaceRockDots - string = re.sub(r'[ß]', "ss", string) - string = re.sub(r'[ö]', "oe", string) - string = re.sub(r'[ü]', "ue", string) - string = re.sub(r'[ä]', "ae", string) - - - - # alle punkte raus - string = re.sub(r'[.]', "", string) - - - # alles in klammern raus - stringlist = string.split() - strings=[] - for w in stringlist: - if not bool(re.match(r'/\(([^)]+)\)/', w)): #todo funzt nich wie's soll - strings.append(w) - string = " ".join(strings) - - #re.sub(r'/\(([^)]+)\)/', " ", string) - - - sysnet.append(string.lower().strip()) - - - print(id,sysnet) - - - - - - - - - +import re +from collections import Counter +def words(text): return re.findall(r'\w+', text.lower()) +WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read())) +def P(word, N=sum(WORDS.values())): + "Probability of `word`." 
+    return WORDS[word] / N
+def correction(word):
+    "Most probable spelling correction for word."
+    return max(candidates(word), key=P)
+def candidates(word):
+    "Generate possible spelling corrections for word."
+    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
+def known(words):
+    "The subset of `words` that appear in the dictionary of WORDS."
+    return set(w for w in words if w in WORDS)
+def edits1(word):
+    "All edits that are one edit away from `word`."
+    letters = 'abcdefghijklmnopqrstuvwxyz'
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [L + R[1:] for L, R in splits if R]
+    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+    inserts = [L + c + R for L, R in splits for c in letters]
+    return set(deletes + transposes + replaces + inserts)
+def edits2(word):
+    "All edits that are two edits away from `word`."
+    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
@@ -367,3 +319,5 @@ textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
 
 end = time.time()
 print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))
+
+
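
Usage sketch (not part of the patch): the snippet below illustrates how the two pieces introduced above are meant to interact per word — the Norvig-style corrector maps a misspelled token to its most frequent known form, and the synset lookup then maps it to the head term of its synonym block. The inline WORDS counter, THESAURUS list, and the sample ticket text are made-up stand-ins for the real data (in the patch, WORDS comes from Counter(words(open(path2words).read())) and THESAURUS from build_thesaurus()), and correction() is condensed here (no edits2, no probability normalization) to keep the example self-contained.

    from collections import Counter

    # stand-in for: WORDS = Counter(words(open(path2words).read()))
    WORDS = Counter("drucker drucker drucker passwort passwort rechner".split())

    # stand-in for: THESAURUS = build_thesaurus(lexicalentries, synsets)
    # each block lists synonyms with the preferred (shortest) form first
    THESAURUS = [["drucker", "laserdrucker"], ["passwort", "kennwort"]]

    def edits1(word):
        "All edits that are one edit away from `word` (same as in the patch)."
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def correction(word):
        "Pick the most frequent known candidate; otherwise keep the word."
        candidates = ({word} & set(WORDS)) or (edits1(word) & set(WORDS)) or {word}
        return max(candidates, key=lambda w: WORDS[w])

    def getFirstSynonym(word, thesaurus=THESAURUS):
        "Map a word to the head term of its synonym block, if there is one."
        for syn_block in thesaurus:
            if word in syn_block:
                return syn_block[0]
        return word

    ticket = "kennwort fuer den drcuker vergessen"
    print(" ".join(getFirstSynonym(correction(w)) for w in ticket.split()))
    # -> "passwort fuer den drucker vergessen"

Ordering note: the sketch corrects first and normalizes synonyms second, which matches the open question left in stringcleaning() ("before or after lemmatizing?") only in spirit — where these steps sit relative to lemmatization is still marked as undecided in the patch.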