diff --git a/corporization.py b/corporization.py
new file mode 100644
index 0000000..00e958b
--- /dev/null
+++ b/corporization.py
@@ -0,0 +1,251 @@
+# -*- coding: utf-8 -*-
+
+import time
+
+from datetime import datetime
+import logging
+from nltk.corpus import stopwords
+import csv
+import functools
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)
+
+
+# run on the server with:
+# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
+
+# input CSV exports (the last assignment wins)
+#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
+#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
+#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
+path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
+
+path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"
+
+# CSV column that holds the ticket text
+content_collumn_name = "Description"
+
+# metadata columns to carry over into the corpus
+metaliste = [
+    "TicketNumber",
+    "Subject",
+    "CreatedDate",
+    "categoryName",
+    "Impact",
+    "Urgency",
+    "BenutzerID",
+    "VerantwortlicherID",
+    "EigentuemerID",
+    "Solution"
+]
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "de_raw_corpus"
+
+logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
+
+
+# TODO: configuration file? (see the configparser sketch at the end of this document)
+"""
+config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+
+config = ConfigParser.ConfigParser()
+with open(config_ini) as f:
+    config.read_file(f)
+"""
+
+
+# configure logging
+logging.basicConfig(filename=logfile, level=logging.INFO)
+# logging.basicConfig(filename=config.get("filepath", "logfile"), level=logging.INFO)
+
+
+def printlog(string, level="INFO"):
+    """print and log a message"""
+    print(string)
+    if level == "INFO":
+        logging.info(string)
+    elif level == "DEBUG":
+        logging.debug(string)
+    elif level == "WARNING":
+        logging.warning(string)
+
+
+def printRandomDoc(textacyCorpus):
+    """print a randomly chosen document (text and metadata) of the corpus"""
+    import random
+    print()
+
+    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
+    randIndex = int((len(textacyCorpus) - 1) * random.random())
+    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
+                                                               textacyCorpus[randIndex].metadata))
+
+    print()
+
+
+def csv_to_textStream(path2csv: str, content_collumn_name: str):
+    """
+    :param path2csv: string
+    :param content_collumn_name: string
+    :return: string-generator
+    """
+    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # , encoding='utf8')
+    content_collumn = 0  # default value
+
+    for i, lst in enumerate(stream):
+        if i == 0:
+            # look for the desired column in the header row
+            for j, col in enumerate(lst):
+                if col == content_collumn_name:
+                    content_collumn = j
+        else:
+            yield lst[content_collumn]
+
+
+def csv_to_DictStream(path2csv: str, metalist: [str]):
+    """
+    :param path2csv: string
+    :param metalist: list of strings
+    :return: dict-generator
+    """
+    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # , encoding='utf8')
+
+    content_collumn = 0  # default value
+    metaindices = []
+    metadata_temp = {}
+    for i, lst in enumerate(stream):
+        if i == 0:
+            # map the requested metadata names to column indices
+            # (could surely be done more efficiently, but it only runs once)
+            for j, col in enumerate(lst):
+                for key in metalist:
+                    if re.sub('[^a-zA-Z]+', '', key) == re.sub('[^a-zA-Z]+', '', col):
+                        metaindices.append(j)
+            metadata_temp = dict(
+                zip(metalist, metaindices))  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
+
+        else:
+            metadata = metadata_temp.copy()
+            for key, value in metadata.items():
+                metadata[key] = lst[value]
+            yield metadata
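+
+# --- Editor's sketch (not part of the original patch): the same named-column lookup
+# --- can be written with the standard-library csv.DictReader, assuming the CSV headers
+# --- match the metaliste entries exactly (the code above matches them only on their
+# --- alphabetic characters).
+def csv_to_DictStream_alt(path2csv, metalist):
+    with open(path2csv, newline="") as f:
+        for row in csv.DictReader(f, delimiter=";"):
+            yield {key: row[key] for key in metalist if key in row}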
+
+def save_corpus(corpus, corpus_path, corpus_name, parser):
+    """
+    # save stringstore
+    stringstore_path = corpus_path + corpus_name + '_strings.json'
+    with open(stringstore_path, "w") as file:
+        parser.vocab.strings.dump(file)
+
+    # TODO: save vocab?
+    """
+
+    # save parser
+    parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
+
+    # save content
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
+
+    # save meta
+    metapath = corpus_path + corpus_name + "_meta.json"
+    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
+
+
+##################################################################################################
+
+
+def cleanTextstream(textstream):
+    """
+    :param textstream: string-gen
+    :yield: string-gen
+    """
+    for txt in textstream:
+        yield textacy.preprocess.normalize_whitespace(txt)
+
+
+def cleanDictstream(dictstream):
+    """
+    :param dictstream: dict-gen
+    :yield: dict-gen
+    """
+    for dic in dictstream:
+
+        result = {}
+
+        for key, value in dic.items():
+            result[key] = textacy.preprocess.normalize_whitespace(value)
+        yield result
+
+
+def main():
+
+    printlog("Corporization: {0}".format(datetime.now()))
+
+    path_csv_split = path2de_csv.split("/")
+    printlog(path_csv_split[len(path_csv_split) - 1])
+    path_csv_split = path2en_csv.split("/")
+    printlog(path_csv_split[len(path_csv_split) - 1])
+
+    start = time.time()
+
+    DE_PARSER = spacy.load("de")
+    EN_PARSER = spacy.load("en")
+
+    de_corpus = textacy.Corpus(DE_PARSER)
+    en_corpus = textacy.Corpus(EN_PARSER)
+
+
+    # add texts to the textacy corpus
+    printlog("Add texts to textacy-corpus")
+
+    de_corpus.add_texts(
+        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
+        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
+    )
+
+    # remove empty docs from the corpus
+    de_corpus.remove(lambda doc: len(doc) == 0)
+
+    for i in range(20):
+        printRandomDoc(de_corpus)
+
+    # save corpus
+    save_corpus(corpus=de_corpus, corpus_path=corpus_path, corpus_name=corpus_name, parser=DE_PARSER)
+
+    # TODO: do the same for en_corpus (a sketch follows below)
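+    # --- Editor's sketch (not part of the original patch) for the TODO above: the English
+    # --- export would go through the identical pipeline; the corpus name "en_raw_corpus"
+    # --- is an assumption.
+    # en_corpus.add_texts(
+    #     cleanTextstream(csv_to_textStream(path2en_csv, content_collumn_name)),
+    #     cleanDictstream(csv_to_DictStream(path2en_csv, metaliste))
+    # )
+    # en_corpus.remove(lambda doc: len(doc) == 0)
+    # save_corpus(corpus=en_corpus, corpus_path=corpus_path,
+    #             corpus_name="en_raw_corpus", parser=EN_PARSER)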
+
+    end = time.time()
+    printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/init.py b/init.py
new file mode 100644
index 0000000..83f4ca1
--- /dev/null
+++ b/init.py
@@ -0,0 +1,286 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+
+import time
+import logging
+from nltk.corpus import stopwords as nltk_stopwords
+from collections import Counter
+import csv
+import re
+import xml.etree.ElementTree as ET
+import spacy
+import textacy
+from scipy import *
+import sys
+csv.field_size_limit(sys.maxsize)
+import pickle
+
+
+# TODO: configuration file? (see the configparser sketch at the end of this document)
+"""
+config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+
+config = ConfigParser.ConfigParser()
+with open(config_ini) as f:
+    config.read_file(f)
+"""
+
+
+# configure logging
+logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
+logging.basicConfig(filename=logfile, level=logging.INFO)
+
+
+DE_PARSER = spacy.load("de")
+EN_PARSER = spacy.load("en")
+
+
+def replaceRockDots():
+    """return a function that lowercases a string and rewrites umlauts/ß (ä->ae, ö->oe, ü->ue, ß->ss)"""
+    return lambda string: re.sub(r'[ß]', "ss",
+                                 re.sub(r'[ö]', "oe",
+                                        re.sub(r'[ü]', "ue",
+                                               re.sub(r'[ä]', "ae", string.lower()))))
+
+
+def printlog(string, level="INFO"):
+    """print and log a message"""
+    print(string)
+    if level == "INFO":
+        logging.info(string)
+    elif level == "DEBUG":
+        logging.debug(string)
+    elif level == "WARNING":
+        logging.warning(string)
+
+
+def save_obj(obj, path):
+    with open(path + '.pkl', 'wb') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
+
+
+def load_obj(path):
+    with open(path + '.pkl', 'rb') as f:
+        return pickle.load(f)
+
+
+def create_lemma_dict(lemmalist):
+    """build a word -> lemma dictionary from lines of 'lemma word' pairs"""
+    lemma_dict = {}
+
+    for line in lemmalist:
+        lem_word_pair = line.split()
+
+        lemma = lem_word_pair[0].strip().lower()
+        word = lem_word_pair[1].strip().lower()
+
+        lemma_dict[word] = lemma
+
+    return lemma_dict
+
+
+def build_thesaurus(path2lexicalentries, path2synsets):
+    """collect, for every synset, the normalized written forms of its lexical entries"""
+    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
+    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
+
+    lexroot = lextree.getroot()
+    synroot = syntree.getroot()
+
+    thesaurus = []
+
+    for r in synroot:
+        for element in r:
+
+            if element.tag == "Synset":
+                sysnet = []
+                attrib = element.attrib
+                id = attrib["id"]
+
+                for ro in lexroot:
+                    for elem in ro:
+                        if elem.tag == "LexicalEntry":
+                            subs_dicts = [subentry.attrib for subentry in elem]
+                            # e.g. [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
+
+                            dic = {k: v for x in subs_dicts for k, v in x.items()}  # merge into one dict
+                            if "synset" in dic.keys():
+                                if dic["synset"] == id:
+                                    string = (dic["writtenForm"])
+
+                                    # replaceRockDots
+                                    string = re.sub(r'[ß]', "ss", string)
+                                    string = re.sub(r'[ö]', "oe", string)
+                                    string = re.sub(r'[ü]', "ue", string)
+                                    string = re.sub(r'[ä]', "ae", string)
+
+                                    # remove all dots
+                                    string = re.sub(r'[.]', "", string)
+
+                                    # remove everything in parentheses
+                                    string = re.sub(r"\((.*)\)", " ", string)
+
+                                    # normalize longer whitespace runs
+                                    string = textacy.preprocess.normalize_whitespace(string)
+
+                                    sysnet.append(string.lower().strip())
+
+                # sort by the number of words in each string
+                sysnet.sort(key=lambda x: len(x.split()))
+                if len(sysnet) != 0:
+                    # TODO: why are some of them empty?
+                    thesaurus.append(sysnet)
+    return thesaurus
+
+    # TODO: turn the thesaurus into a dictionary (a sketch follows below)
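+
+# --- Editor's sketch (not part of the original patch) for the TODO above: map every
+# --- term of a synset to its shortest member, which then acts as the "main" term.
+# --- Assumes the synsets are the lists returned by build_thesaurus(), already sorted
+# --- by word count.
+def thesaurus_to_dict(thesaurus):
+    mapping = {}
+    for synset in thesaurus:
+        main_term = synset[0]  # shortest entry, thanks to the sort in build_thesaurus()
+        for term in synset:
+            mapping.setdefault(term, main_term)
+    return mapping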
+
+def create_stopwordlist():
+    """merge German stop words from a local file, NLTK, spaCy and stopwords-de.txt"""
+    de_stop_words1 = list(map(replaceRockDots(),
+                              list(map(textacy.preprocess.normalize_whitespace,
+                                       textacy.fileio.read_file_lines(
+                                           "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")))))
+
+    de_stop_words2 = list(map(replaceRockDots(), list(set(nltk_stopwords.words('german')))))
+
+    de_stop_words3 = list(map(replaceRockDots(),
+                              list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))
+
+    de_stop_words4 = list(map(replaceRockDots(), list(textacy.fileio.read_file_lines("stopwords-de.txt"))))
+
+    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))
+
+    return de_stop_words
+
+    # TODO: en_stop_words = set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + list(set(nltk_stopwords.words('english'))))
+
+
+########################## Spellchecking ##########################################
+# http://norvig.com/spell-correct.html
+# http://wortschatz.uni-leipzig.de/en/download
+
+def words(text): return re.findall(r'\w+', text.lower())
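+
+# --- Editor's sketch (not part of the original patch): the remaining steps of the
+# --- Norvig spell corrector referenced above, parameterized over the WORDS counter
+# --- that main() builds below; the candidate letters additionally include German
+# --- umlauts, which is an assumption about the word list.
+def known(words_, WORDS):
+    """the subset of candidate words that actually occur in the frequency counter"""
+    return set(w for w in words_ if w in WORDS)
+
+def edits1(word):
+    """all edits that are one edit away from word"""
+    letters = 'abcdefghijklmnopqrstuvwxyzäöüß'
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [L + R[1:] for L, R in splits if R]
+    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+    inserts = [L + c + R for L, R in splits for c in letters]
+    return set(deletes + transposes + replaces + inserts)
+
+def edits2(word):
+    """all edits that are two edits away from word"""
+    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
+
+def candidates(word, WORDS):
+    return known([word], WORDS) or known(edits1(word), WORDS) or known(edits2(word), WORDS) or [word]
+
+def correction(word, WORDS):
+    """most frequent, hence most probable, spelling correction for word"""
+    return max(candidates(word, WORDS), key=lambda w: WORDS[w])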
+
+##################################################################################################
+
+# goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable pickle files,
+# plus a clean stop-word list and a noun list
+
+
+# THESAURUS (the last assignment wins)
+#lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
+#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
+
+lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
+synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
+
+
+# SPELLCHECKING
+path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
+
+
+# output paths for the pickled objects
+path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl"
+path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list.pkl"
+path2thesauruslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list.pkl"
+path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl"
+path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl"
+path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl"
+
+
+def main():
+    start = time.time()
+    printlog("Init: {0}".format(datetime.now()))
+
+    printlog("Create and save lemma_dict")
+    LEMMAS = list(
+        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
+
+    lemma_dict = create_lemma_dict(LEMMAS)
+    save_obj(lemma_dict, path2lemmadict)
+
+
+    printlog("Build and save word list for spellchecking")
+    WORDS = Counter(words(open(path2words).read()))
+    save_obj(WORDS, path2wordlist)
+
+
+    printlog("Build and save thesaurus")
+    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
+    print(THESAURUS[0:10])
+
+    save_obj(THESAURUS, path2thesauruslist)
+
+
+    printlog("Build and save stop word list")
+    de_stop_words = create_stopwordlist()
+    save_obj(de_stop_words, path2stopwordlist)
+
+
+    printlog("Build and save noun list")
+    NOUNS = list(textacy.fileio.read_file_lines(
+        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
+        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
+    NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
+    save_obj(NOUNS, path2NOUNSlist)
+
+
+    printlog("Build and save first names list")
+    VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
+        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
+
+    save_obj(VORNAMEN, path2firstnameslist)
+
+
+    end = time.time()
+    printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/preprocessing.py b/preprocessing.py
index cc8d3ba..0bd5e73 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -12,6 +12,8 @@ path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-E
 path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"
 
+# idea: save a raw corpus (only whitespace removed) -> preprocessed corpus -> work with that
+
 path_csv_split = path2de_csv.split("/")
 print(path_csv_split[len(path_csv_split) - 1])
@@ -124,7 +126,15 @@ specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORE
 hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
 
+import pickle
+
+def save_obj(obj, path):
+    with open(path + '.pkl', 'wb') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
+
+def load_obj(path):
+    with open(path + '.pkl', 'rb') as f:
+        return pickle.load(f)
 def printlog(string, level="INFO"):
     """log and prints"""
     print(string)
diff --git a/testra.py b/testra.py
index 13de581..f7398da 100644
--- a/testra.py
+++ b/testra.py
@@ -52,8 +52,44 @@ corpus.add_texts(
 print(corpus)
 """
 
+import pickle
+
+def save_obj(obj, path):
+    with open(path + '.pkl', 'wb') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
+
+def load_obj(path):
+    with open(path + '.pkl', 'rb') as f:
+        return pickle.load(f)
+
+# quick test: build the word -> lemma dictionary, pickle it and read it back
+lemmalist = list(map(textacy.preprocess.normalize_whitespace,
+                     list(textacy.fileio.read_file_lines(
+                         "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))))
+
+lemma_dict = {}
+
+for line in lemmalist:
+    lem_word_pair = line.split()
+
+    lemma = lem_word_pair[0].strip().lower()
+    word = lem_word_pair[1].strip().lower()
+
+    lemma_dict[word] = lemma
+
+
+print(lemma_dict["abbekomme"])
+
+save_obj(lemma_dict, "test_dictionies")
+
+loaded = load_obj("test_dictionies")
+
+print(loaded["abbekomme"])
+
+"""
 from postal.parser import parse_address
 
@@ -63,7 +99,7 @@ print(parse_address(address))
 
 address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
 print(parse_address(address))
 
-
+"""
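
# --- Editor's sketch (not part of any patch above) for the "configuration file?" TODOs
# --- in corporization.py and init.py: the hard-coded paths could be read from the
# --- config.ini already referenced there, using the standard-library configparser.
# --- The [filepath] section and the logfile key come from the commented-out logging
# --- call in corporization.py; the other key names are assumptions.
from configparser import ConfigParser

config = ConfigParser()
config.read("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini")

# config.ini would then contain, e.g.:
# [filepath]
# logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
# de_csv  = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv
# corpus  = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/

logfile = config.get("filepath", "logfile")
path2de_csv = config.get("filepath", "de_csv")
corpus_path = config.get("filepath", "corpus")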