From 4fe12679fbe19274594199a6163c87e249fc272d Mon Sep 17 00:00:00 2001
From: "jannis.grundmann"
Date: Thu, 12 Oct 2017 15:57:56 +0200
Subject: [PATCH] improved runtime of the thesaurus construction

---
 corporization.py |  47 ++++------------
 init.py          | 119 ++++++++++++++++++++++++++++++++++------
 preprocessing.py | 140 +++++++++++++++++++++++++++++++++--------------
 testra.py        | 126 +++++++++++++++++++++++++++++++++++-------
 4 files changed, 317 insertions(+), 115 deletions(-)

diff --git a/corporization.py b/corporization.py
index 00e958b..06e0bdb 100644
--- a/corporization.py
+++ b/corporization.py
@@ -33,7 +33,6 @@ path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-E
 content_collumn_name = "Description"

 metaliste = [
-
     "TicketNumber",
     "Subject",
     "CreatedDate",
@@ -46,15 +45,18 @@ metaliste = [
     "Solution"
 ]

+
+
+
 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"

-corpus_name = "de_raw_corpus"
+corpus_name = "de_raw_ticketCorpus"

 logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"


-# todo configuration file ?
+# todo configuration file
 """
 config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
@@ -98,7 +100,7 @@ def printRandomDoc(textacyCorpus):



-def csv_to_textStream(path2csv: str, content_collumn_name: str):
+def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
     """
     :param path2csv: string
     :param content_collumn_name: string
@@ -117,7 +119,7 @@ def csv_to_textStream(path2csv: str, content_collumn_name: str):
             yield lst[content_collumn]


-def csv_to_DictStream(path2csv: str, metalist: [str]):
+def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
     """
     :param path2csv: string
     :param metalist: list of strings
@@ -155,7 +157,8 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):
     """

     # save parser
-    parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
+    parserpath = corpus_path + str(parser.lang) + '_parser'
+    parser.save_to_directory(parserpath)

     # save content
     contentpath = corpus_path + corpus_name + "_content.bin"
@@ -171,34 +174,6 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):


-def cleanTextstream(textstream):
-    """
-    :param textstream: string-gen
-    :param parser: spacy-parser
-    :yield: string-gen
-    """
-
-    for txt in textstream:
-        yield textacy.preprocess.normalize_whitespace(txt)
-
-
-def cleanDictstream(dictstream):
-    """
-    :param dictstream: dict-gen
-    :param parser: spacy-parser
-    :yield: dict-gen
-    """
-
-    for dic in dictstream:
-
-        result = {}
-
-        for key, value in dic.items():
-            result[key] = textacy.preprocess.normalize_whitespace(value)
-        yield result
-
-
 def main():
     printlog("Corporization: {0}".format(datetime.now()))
@@ -222,8 +197,8 @@ def main():

     printlog("Add texts to textacy-corpus")
     de_corpus.add_texts(
-        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
-        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
+        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
+        ticket_csv_to_DictStream(path2de_csv, metaliste)
     )

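Reviewer note (not part of the patch): with cleanTextstream/cleanDictstream removed above, main() now hands the two renamed generators straight to de_corpus.add_texts(), which consumes text and metadata pairwise. Below is a minimal, self-contained sketch of that two-generator CSV streaming pattern; the file name tickets.csv, the column names and the consuming loop are illustrative assumptions, not code from the patch, and the real script's CSV dialect may differ.

import csv

def text_stream(path, text_column):
    # yield one ticket text per CSV row
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            yield row[text_column]

def meta_stream(path, meta_columns):
    # yield the matching metadata dict for the same row
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            yield {col: row[col] for col in meta_columns}

if __name__ == "__main__":
    texts = text_stream("tickets.csv", "Description")               # hypothetical file and column
    metas = meta_stream("tickets.csv", ["TicketNumber", "Subject"])
    for text, meta in zip(texts, metas):                             # add_texts() pairs the streams the same way
        print(meta["TicketNumber"], text[:40])
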
+""" def build_thesaurus(path2lexicalentries, path2synsets): lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8")) syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8")) @@ -134,6 +135,94 @@ def build_thesaurus(path2lexicalentries, path2synsets): return thesaurus #todo thesaurus in dictionary +""" + +def build_thesaurus(path2lexicalentries):#, path2synsets): + lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8")) + #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8")) + + lexroot = lextree.getroot() + #synroot = syntree.getroot() + + + word2synsets = {} + template = {"w1": ["s1", "s2"]} + + for ro in lexroot: + for elem in ro: + if elem.tag == "LexicalEntry": + lex_dictlist = [subentry.attrib for subentry in elem] + + + + synlist = [] + string = "WORD" + + for lex_dict in lex_dictlist: + if "synset" in lex_dict.keys(): + + synset = lex_dict["synset"] + synlist.append(synset) + + if 'writtenForm' in lex_dict.keys(): + string = (lex_dict["writtenForm"]) + + # replaceRockDots + string = re.sub(r'[ß]', "ss", string) + string = re.sub(r'[ö]', "oe", string) + string = re.sub(r'[ü]', "ue", string) + string = re.sub(r'[ä]', "ae", string) + + # alle punkte raus + string = re.sub(r'[.]', "", string) + + # alles in klammern raus + string = re.sub(r"\((.*)\)", " ", string) + + # längeres leerzeichen normalisieren + string = textacy.preprocess.normalize_whitespace(string) + + string = string.lower().strip() + + word2synsets[string] = synlist + + synset2Words = {} + template = {"s1": ["w1","w2"]} + + for word,synset in word2synsets.items(): + for syn in synset: + if syn not in synset2Words.keys(): + synset2Words[syn] = [word] + else: + synset2Words[syn].append(word) + + # nach anzhal der wörter in den strings sortieren + for synset in word2synsets.values(): + synset.sort(key=lambda x: len(x.split())) + + thesaurus = {} + thesaurus_template = {"w1" : "mainsyn"} + + for word,synset in word2synsets.items(): + try: + thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym + except: + pass + return thesaurus + + """ + for r in synroot: + for element in r: + + if element.tag == "Synset": + synset = [] + attrib = element.attrib + id = attrib["id"] + + if id not in synset2Words.keys(): + synset2Words[id] = "WORD" + """ + def create_stopwordlist(): @@ -151,7 +240,7 @@ def create_stopwordlist(): de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." 
diff --git a/preprocessing.py b/preprocessing.py
index 0bd5e73..6ae8ccc 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -38,6 +38,45 @@ csv.field_size_limit(sys.maxsize)


+import pickle
+
+def save_obj(obj, path):
+    with open(path + '.pkl', 'wb') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
+
+def load_obj(path ):
+    with open(path + '.pkl', 'rb') as f:
+        return pickle.load(f)
+
+
+def load_corpus(corpus_path, corpus_name, lang="de"):
+
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    metapath = corpus_path + corpus_name + "_meta.json"
+
+    #load parser
+    parserpath = corpus_path + str(lang) + '_parser'
+    parser = spacy.load(parserpath)
+
+    corpus = textacy.Corpus(parser)
+
+
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    return corpus
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "de_raw_ticketCorpus"
+
+print(load_corpus(corpus_path,corpus_name))
+
+
+
+
 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"

@@ -63,36 +102,31 @@ logging.basicConfig(filename=logfile, level=logging.INFO)


 # THESAURUS
+path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
+THESAURUS = load_obj(path2thesaurusdict)

-# thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
-# thesauruspath = config.get("filepath","thesauruspath")
-# THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
-lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
-synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"

 # SPELLCHECKING
-path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
+path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
+path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
+path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
+path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
+path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
+
+
+# SPELLCHECKING
+
+
+parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

 DE_PARSER = spacy.load("de")
 EN_PARSER = spacy.load("en")

-
-"""
-de_stop_words= set(
-    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
-    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
-)
-
-
-LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
-
-VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
-"""
-
 de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
     "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))

@@ -126,15 +160,7 @@ specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORE

 hardSFinder = re.compile(r'[ß]', re.IGNORECASE)

-import pickle

-def save_obj(obj, path):
-    with open(path + '.pkl', 'wb') as f:
-        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
-
-def load_obj(path ):
-    with open(path + '.pkl', 'rb') as f:
-        return pickle.load(f)

 def printlog(string, level="INFO"):
     """log and prints"""
     print(string)
@@ -238,21 +264,6 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):

             yield metadata

-def save_corpus(corpus, corpus_path, corpus_name, parser=DE_PARSER):
-
-    # save stringstore
-    stringstore_path = corpus_path + corpus_name + '_strings.json'
-    with open(stringstore_path, "w") as file:
-        parser.vocab.strings.dump(file)
-
-    # save content
-    contentpath = corpus_path + corpus_name + "_content.bin"
-    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
-
-    # save meta
-    metapath = corpus_path + corpus_name + "_meta.json"
-    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
-

 ############# filter tokens

@@ -751,6 +762,51 @@ def filterTokens(tokens, funclist):

     return tokens

+def cleanString(string):
+    # replaceRockDots
+    string = re.sub(r'[ß]', "ss", string)
+    string = re.sub(r'[ö]', "oe", string)
+    string = re.sub(r'[ü]', "ue", string)
+    string = re.sub(r'[ä]', "ae", string)
+
+
+    # normalize longer whitespace
+    string = textacy.preprocess.normalize_whitespace(string)
+
+    return(string)
+
+def normalizeTextStream(textstream,clean=False):
+    """
+    :param textstream: string-gen
+    :param parser: spacy-parser
+    :yield: string-gen
+    """
+
+    for txt in textstream:
+        if clean:
+            yield cleanString(txt)
+        else:
+            yield textacy.preprocess.normalize_whitespace(txt)
+
+def nomalizeDictstream(dictstream, clean=False):
+    """
+    :param dictstream: dict-gen
+    :param parser: spacy-parser
+    :yield: dict-gen
+    """
+
+    for dic in dictstream:
+
+        result = {}
+
+        for key, value in dic.items():
+            if clean:
+                result[key] = cleanString(value)
+            else:
+                result[key] = textacy.preprocess.normalize_whitespace(value)
+        yield result
+
+
 custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
                 "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen",

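Reviewer note (not part of the patch): cleanString() and the two stream normalizers added above fold the lowercase umlauts and ß to ASCII digraphs and collapse whitespace before parsing. A standalone illustration of that behaviour follows; the whitespace regex stands in for textacy.preprocess.normalize_whitespace so the snippet runs without textacy, and the sample strings are invented.

import re

def clean_string(s):
    # same lowercase umlaut/ß folding as cleanString() in the patch
    s = re.sub(r'[ß]', "ss", s)
    s = re.sub(r'[ö]', "oe", s)
    s = re.sub(r'[ü]', "ue", s)
    s = re.sub(r'[ä]', "ae", s)
    return re.sub(r'\s+', " ", s).strip()   # rough stand-in for normalize_whitespace

def normalize_text_stream(texts, clean=False):
    # mirrors normalizeTextStream(clean=...) from the patch
    for txt in texts:
        yield clean_string(txt) if clean else re.sub(r'\s+', " ", txt).strip()

print(list(normalize_text_stream(["Grüße  aus   Köln", "Passwort   zurücksetzen"], clean=True)))
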
diff --git a/testra.py b/testra.py
index f7398da..c13dcf2 100644
--- a/testra.py
+++ b/testra.py
@@ -5,6 +5,7 @@ import json

 import spacy
 import textacy
+from functools import reduce

 start = time.time()

@@ -52,6 +53,8 @@ corpus.add_texts(
 print(corpus)
 """

+
+
 import pickle

 def save_obj(obj, path):
@@ -63,31 +66,122 @@ def load_obj(path ):
         return pickle.load(f)


+# THESAURUS
+lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
+lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
+synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"

-lemmalist = list(map(textacy.preprocess.normalize_whitespace,
-                     list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))))
-
-lemma_dict = {}
+def build_thesaurus(path2lexicalentries):#, path2synsets):
+    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
+    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

-for line in lemmalist:
+    lexroot = lextree.getroot()
+    #synroot = syntree.getroot()

-    lem_word_pair = line.split()
+    word2synsets = {}
+    template = {"w1": ["s1", "s2"]}

-    lemma = lem_word_pair[0].strip().lower()
+    for ro in lexroot:
+        for elem in ro:
+            if elem.tag == "LexicalEntry":
+                lex_dictlist = [subentry.attrib for subentry in elem]

-    word = lem_word_pair[1].strip().lower()
-
-    lemma_dict[word] = lemma
+                synlist = []
+                string = "WORD"

-print(lemma_dict["abbekomme"])
+                for lex_dict in lex_dictlist:
+                    if "synset" in lex_dict.keys():

-save_obj(lemma_dict, "test_dictionies")
+                        synset = lex_dict["synset"]
+                        synlist.append(synset)

-loaded = load_obj("test_dictionies")
+                    if 'writtenForm' in lex_dict.keys():
+                        string = (lex_dict["writtenForm"])
-print(loaded["abbekomme"])
+
+                        # replaceRockDots
+                        string = re.sub(r'[ß]', "ss", string)
+                        string = re.sub(r'[ö]', "oe", string)
+                        string = re.sub(r'[ü]', "ue", string)
+                        string = re.sub(r'[ä]', "ae", string)
+
+                        # strip all periods
+                        string = re.sub(r'[.]', "", string)
+
+                        # strip everything in parentheses
+                        string = re.sub(r"\((.*)\)", " ", string)
+
+                        # normalize longer whitespace
+                        string = textacy.preprocess.normalize_whitespace(string)
+
+                        string = string.lower().strip()
+
+                word2synsets[string] = synlist
+
+    synset2Words = {}
+    template = {"s1": ["w1","w2"]}
+
+    for word,synset in word2synsets.items():
+        for syn in synset:
+            if syn not in synset2Words.keys():
+                synset2Words[syn] = [word]
+            else:
+                synset2Words[syn].append(word)
+
+    # sort by the number of words in the strings
+    for synset in word2synsets.values():
+        synset.sort(key=lambda x: len(x.split()))
+
+    thesaurus = {}
+    thesaurus_template = {"w1" : "mainsyn"}
+
+    for word,synset in word2synsets.items():
+        try:
+            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
+        except:
+            pass
+    return thesaurus
+
+    """
+    for r in synroot:
+        for element in r:
+
+            if element.tag == "Synset":
+                synset = []
+                attrib = element.attrib
+                id = attrib["id"]
+
+                if id not in synset2Words.keys():
+                    synset2Words[id] = "WORD"
+    """
+
+
+def load_corpus(corpus_path, corpus_name, lang="de"):
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    metapath = corpus_path + corpus_name + "_meta.json"
+
+    # load parser
+    parserpath = corpus_path + str(lang) + '_parser'
+    parser = spacy.load(parserpath)
+
+    corpus = textacy.Corpus(parser)
+
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    return corpus
+
+#todo load corpus from file; idea: load the stringstore and vocab
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "de_raw_ticketCorpus"
+
+print(load_corpus(corpus_path, corpus_name))

 """
 from postal.parser import parse_address

 address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder-

 print(parse_address(address))
 """

 """
 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
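Reviewer note (not part of the patch): the rewritten build_thesaurus() builds two plain dicts, word2synsets and its inversion synset2Words, and then resolves every word to the first word of its first synset with constant-time lookups, which is presumably where the runtime improvement named in the subject line comes from. A toy illustration of that inversion, using invented words and synset ids instead of the parsed lexicalentries.xml input:

# invented words and synset ids, standing in for the parsed lexical entries
word2synsets = {
    "rechner": ["s1"],
    "computer": ["s1"],
    "pc": ["s1", "s2"],
}

# invert the mapping: synset id -> all words that belong to it
synset2words = {}
for word, synsets in word2synsets.items():
    for syn in synsets:
        synset2words.setdefault(syn, []).append(word)

# pick the first word of each word's first synset as its "main" synonym,
# mirroring the assumption noted in the patch
thesaurus = {}
for word, synsets in word2synsets.items():
    if synsets and synset2words.get(synsets[0]):
        thesaurus[word] = synset2words[synsets[0]][0]

print(thesaurus)   # e.g. {'rechner': 'rechner', 'computer': 'rechner', 'pc': 'rechner'}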