# -*- coding: utf-8 -*-
from datetime import datetime
import time
import logging
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys

csv.field_size_limit(sys.maxsize)

import pickle

# todo configuration file ?
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"

# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)

DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")


def replaceRockDots():
    # returns a function that lowercases a string and replaces ä/ö/ü/ß with ae/oe/ue/ss
    return lambda string: re.sub(r'[ß]', "ss",
                                 re.sub(r'[ö]', "oe",
                                        re.sub(r'[ü]', "ue",
                                               re.sub(r'[ä]', "ae", string.lower()))))


def printlog(string, level="INFO"):
    """log and print"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)


def create_lemma_dict(lemmalist):
    # maps each word form to its lemma; expects one "lemma word" pair per input line
    lemma_dict = {}

    for line in lemmalist:
        lem_word_pair = line.split()

        lemma = lem_word_pair[0].strip().lower()
        word = lem_word_pair[1].strip().lower()

        lemma_dict[word] = lemma

    return lemma_dict
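# Example of the input format create_lemma_dict() expects (an assumption inferred
# from the parsing above, not checked against the real lemmas.txt): one
# whitespace-separated "lemma word" pair per line, e.g.
#
#     create_lemma_dict(["haus haeuser", "gehen ging"])
#     # -> {"haeuser": "haus", "ging": "gehen"}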
"""
def build_thesaurus(path2lexicalentries, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    synroot = syntree.getroot()

    thesaurus = []

    for r in synroot:
        for element in r:

            if element.tag == "Synset":
                sysnet = []
                attrib = element.attrib
                id = attrib["id"]

                for ro in lexroot:
                    for elem in ro:
                        if elem.tag == "LexicalEntry":
                            subs_dicts = [subentry.attrib for subentry in elem]
                            # e.g. [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]

                            dic = {k: v for x in subs_dicts for k, v in x.items()}  # merge into one dict
                            if "synset" in dic.keys():
                                if dic["synset"] == id:
                                    string = (dic["writtenForm"])

                                    # replaceRockDots
                                    string = re.sub(r'[ß]', "ss", string)
                                    string = re.sub(r'[ö]', "oe", string)
                                    string = re.sub(r'[ü]', "ue", string)
                                    string = re.sub(r'[ä]', "ae", string)

                                    # remove all periods
                                    string = re.sub(r'[.]', "", string)

                                    # remove everything in parentheses
                                    string = re.sub(r"\((.*)\)", " ", string)

                                    # normalize runs of whitespace
                                    string = textacy.preprocess.normalize_whitespace(string)

                                    sysnet.append(string.lower().strip())

                # sort by the number of words in each string
                sysnet.sort(key=lambda x: len(x.split()))
                if len(sysnet) != 0:
                    # todo why are some empty?
                    thesaurus.append(sysnet)
    return thesaurus
    # todo thesaurus as dictionary
"""


def build_thesaurus(path2lexicalentries):  # , path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    # syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    # synroot = syntree.getroot()

    # word -> list of synset ids, e.g. {"w1": ["s1", "s2"]}
    word2synsets = {}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # remove all periods
                        string = re.sub(r'[.]', "", string)

                        # remove everything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    # synset id -> list of words, e.g. {"s1": ["w1", "w2"]}
    synset2Words = {}

    for word, synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)

    # sort each synset list by the number of words in each string
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    # word -> main synonym, e.g. {"w1": "mainsyn"}
    thesaurus = {}

    for word, synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except (KeyError, IndexError):
            # entries without any synset (or with unknown synset ids) are skipped
            pass
    return thesaurus

    """
    for r in synroot:
        for element in r:

            if element.tag == "Synset":
                synset = []
                attrib = element.attrib
                id = attrib["id"]

                if id not in synset2Words.keys():
                    synset2Words[id] = "WORD"
    """


def create_stopwordlist():
    de_stop_words1 = list(map(replaceRockDots(),
                              list(map(textacy.preprocess.normalize_whitespace,
                                       textacy.fileio.read_file_lines(
                                           "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")))))

    de_stop_words2 = list(map(replaceRockDots(), list(set(nltk_stopwords.words('german')))))

    de_stop_words3 = list(map(replaceRockDots(), list(
        __import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))

    de_stop_words4 = list(map(replaceRockDots(), list(textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))

    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))

    return de_stop_words

    # todo en_stop_words = set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + list(set(nltk_stopwords.words('english'))))


########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download


def words(text):
    return re.findall(r'\w+', text.lower())
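# Minimal sketch of the Norvig-style correction step that the WORDS counter built in
# main() is meant to feed (see http://norvig.com/spell-correct.html). It is not called
# anywhere in this script; the helper names (known, edits1, correction) follow Norvig's
# write-up, `word_counts` is assumed to be a collections.Counter like WORDS, and the
# German candidate alphabet is an assumption.
def known(candidate_words, word_counts):
    """Keep only candidates that occur in the frequency counter."""
    return set(w for w in candidate_words if w in word_counts)


def edits1(word):
    """All strings one edit (delete, transpose, replace, insert) away from `word`."""
    letters = 'abcdefghijklmnopqrstuvwxyzäöüß'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def correction(word, word_counts):
    """Most frequent known candidate: the word itself, else a 1-edit neighbour."""
    candidates = known([word], word_counts) or known(edits1(word), word_counts) or [word]
    return max(candidates, key=lambda w: word_counts[w])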
##################################################################################################

# goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable files,
# plus a clean stopword list and a noun list


# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
# synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"

# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"

path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"


def main():
    start = time.time()
    printlog("Init: {0}".format(datetime.now()))

    printlog("create and save lemma_dict")
    LEMMAS = list(textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
    lemma_dict = create_lemma_dict(LEMMAS)
    save_obj(lemma_dict, path2lemmadict)

    printlog("Build and save wordlist for spellchecking")
    WORDS = Counter(words(open(path2words).read()))
    save_obj(WORDS, path2wordlist)

    printlog("Build and save thesaurus")
    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)
    save_obj(THESAURUS, path2thesaurusdict)

    printlog("Build and save stopword list")
    de_stop_words = create_stopwordlist()
    save_obj(de_stop_words, path2stopwordlist)

    printlog("Build and save noun list")
    NOUNS = list(textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
    NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
    save_obj(NOUNS, path2NOUNSlist)

    printlog("Build and save firstnames list")
    VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
    save_obj(VORNAMEN, path2firstnameslist)

    end = time.time()
    printlog("Time elapsed for preprocessing: {0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()
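# Example of how a downstream preprocessing/topic-modeling step might load the
# pickled artifacts produced by main() (illustrative only, not executed here):
#
#     lemma_dict = load_obj(path2lemmadict)        # {"word": "lemma", ...}
#     THESAURUS = load_obj(path2thesaurusdict)     # {"word": "main synonym", ...}
#     de_stop_words = load_obj(path2stopwordlist)  # list of normalized stopwords
#     WORDS = load_obj(path2wordlist)              # Counter for spellchecking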