# -*- coding: utf-8 -*-
"""
Initialization script: builds the lookup resources used by the pipeline
(thesaurus, lemma dict, spellcheck word counter, stopword lists, noun and
first-name lists) and pickles each one to the path configured in config.ini.
"""
from miscellaneous import *
from stop_words import get_stop_words
import csv
import os
import re
import sys
import time
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime

from nltk.corpus import stopwords as nltk_stopwords

# Ticket bodies can be huge; lift the csv field limit to the platform maximum.
csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config (ConfigParser is provided by the `miscellaneous` star-import)
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


# "replaceRockDots": umlaut/ß normalization applied to thesaurus entries so the
# keys match the normalization used elsewhere in the pipeline.
# NOTE(review): Ä maps to lowercase "ae" while Ö/Ü map to "Oe"/"Ue" — looks
# inconsistent, but kept exactly as the original to avoid changing existing keys.
_UMLAUT_SUBS = [
    (r'[ß]', "ss"),
    (r'[ö]', "oe"),
    (r'[Ö]', "Oe"),
    (r'[ü]', "ue"),
    (r'[Ü]', "Ue"),
    (r'[ä]', "ae"),
    (r'[Ä]', "ae"),
]


def create_lemma_dict(path2lemmalist):
    """
    Creates a dict out of a txt file a la:
        l1 w1
        l1 w2
        l2 w1
        l2 w2

    Result will be used as lemma_dict[word] --> lemma

    :param path2lemmalist: str
    :return: dictionary
    """
    file_gen = textacy.fileio.read_file_lines(path2lemmalist)
    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(file_gen)))

    lemma_dict = {}
    for line in lemmalist:
        lem_word_pair = line.split()
        lemma = lem_word_pair[0].strip().lower()
        word = lem_word_pair[1].strip().lower()
        lemma_dict[word] = lemma
    return lemma_dict


def build_thesaurus_dict(path2wordnet, returnall=False):
    """
    Creates a dict out of the deWordNet
    https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml

    Result will be used as thesaurus[word] --> main_synonym

    :param path2wordnet: str
    :param returnall: bool if True, also return word2synsets, synset2Words
    :return: dictionaries: thesaurus (and optionally word2synsets, synset2Words)
    """
    lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()

    # word2synsets: normalized written form --> [synset-id, ...]
    word2synsets = {}
    for ro in lexroot:
        for elem in ro:
            if elem.tag != "LexicalEntry":
                continue
            lex_dictlist = [subentry.attrib for subentry in elem]

            synlist = []
            string = "WORD"
            for lex_dict in lex_dictlist:
                if "synset" in lex_dict.keys():
                    synlist.append(lex_dict["synset"])
                if 'writtenForm' in lex_dict.keys():
                    string = lex_dict["writtenForm"]

                    # replaceRockDots
                    for pattern, repl in _UMLAUT_SUBS:
                        string = re.sub(pattern, repl, string)
                    # alle punkte raus
                    string = re.sub(r'[.]', "", string)

                    # alles in klammern raus; "Hauptform" marker wird als
                    # normalisiertes suffix wieder angehängt (evtl. als hauptform merken)
                    if "auptform" in string:
                        string = re.sub(r"\((.*)\)", " ", string)
                        string = string + " (hauptform)"
                    else:
                        string = re.sub(r"\((.*)\)", " ", string)

                    # längeres leerzeichen normalisieren
                    string = textacy.preprocess.normalize_whitespace(string)
                    string = string.strip()  # .lower()
            if string != '':
                word2synsets[string] = synlist

    # synset2Words: synset-id --> [word, ...]
    synset2Words = {}
    for word, synset in word2synsets.items():
        if word != '':
            for syn in synset:
                if syn not in synset2Words.keys():
                    synset2Words[syn] = [word]
                else:
                    synset2Words[syn].append(word)

    # Sortieren: nach anzahl der wörter in den strings (weniger nach vorne),
    # dann die markierte Hauptform (ohne marker) an position 0 schieben.
    for words in synset2Words.values():
        words.sort(key=lambda w: len(w.split()))
        # FIX: iterate a snapshot — the original removed/inserted while
        # iterating the same list, which can skip elements.
        for w in list(words):
            if "(hauptform)" in w:
                to_insert = re.sub(r"\((.*)\)", " ", w).strip()
                words.remove(w)
                words.insert(0, to_insert)  # Hauptform evtl. nach vorne

    # word --> [synset1, synset2, ..] --> synset1 --> [syn1, syn2, ...] --> syn1 / mainsyn
    thesaurus = {}
    for word, synsets in word2synsets.items():
        try:
            first_synset = synsets[0]              # erstes synset wählen, praktischer Grund
            syns = synset2Words[first_synset]      # [syn1, syn2, ...]
            first_syn = syns[0]                    # erstes synonym (evtl. Hauptform) wählen
            word = re.sub(r"\((.*)\)", " ", word).strip()  # (hauptform weg)
            thesaurus[word] = first_syn            # Ann.: erstes synonym ist das Hauptsynonym
        except (IndexError, KeyError):
            # entry without synsets, or synset without words: skip it
            # (narrowed from a bare `except:` that hid real errors)
            pass

    if returnall:
        return thesaurus, word2synsets, synset2Words
    return thesaurus


def _stopword_files_for(lang, paths):
    """Return the paths whose basename looks like '<lang>_stopwords_*'."""
    selected = []
    for path in paths:
        parts = os.path.basename(path).split("_")
        # guard: original indexed parts[1] unconditionally and crashed on
        # filenames without an underscore
        if len(parts) >= 2 and parts[0] == lang and parts[1] == 'stopwords':
            selected.append(path)
    return selected


def _combine_stopwords(raw_words):
    """Normalize whitespace, replace umlauts/ß and de-duplicate a word list."""
    normalized = list(map(textacy.preprocess.normalize_whitespace, raw_words))
    return list(set(map(replaceRockDots_lambda(), normalized)))


def create_stopword_lists(*paths):
    """
    Creates lists of stopwords from: spacy, nltk, stop_words and
    additional text files (one word per line).

    :param paths: additional filepaths; filenames must be a la
                  de_stopwords_1.txt, en_stopwords_2.txt
    :return: lists: de_stopwords, en_stopwords
    """
    ## GERMAN
    de_raw = (
        list(get_stop_words("de"))
        + list(nltk_stopwords.words('german'))
        + list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
        + list_from_files(*_stopword_files_for('de', paths))
    )
    de_stop_words = _combine_stopwords(de_raw)

    ## ENGLISH
    en_raw = (
        list(get_stop_words("en"))
        + list(nltk_stopwords.words('english'))
        + list(__import__("spacy.en", globals(), locals(), ['object']).STOP_WORDS)
        + list_from_files(*_stopword_files_for('en', paths))
    )
    en_stop_words = _combine_stopwords(en_raw)

    return de_stop_words, en_stop_words


def build_words_for_spellchecking(path2words):
    """
    Create word-Counter for spellchecking.

    http://norvig.com/spell-correct.html
    http://wortschatz.uni-leipzig.de/en/download
    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz

    :param path2words: str, path to a large plain-text corpus
    :return: Counter
    """
    def words(text):
        return re.findall(r'\w+', text.lower())

    # FIX: the original `open(path2words).read()` never closed the handle
    with open(path2words) as corpus:
        return Counter(words(corpus.read()))


##################################################################################################

def main():
    """Build every resource and pickle it to its configured path."""
    start = time.time()
    logprint("Init: {0}".format(datetime.now()))
    ressources_path = FILEPATH + "ressources/"

    # THESAURUS
    logprint("Build and save Thesaurus")
    path2wordnet = ressources_path + config.get("thesaurus", "input")
    thesaurus = build_thesaurus_dict(path2wordnet)
    path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")
    save_obj(thesaurus, path2thesaurus_dict)

    # LEMMA
    logprint("create and save lemma_dict")
    path2lemma_file = ressources_path + config.get("lemmatization", "input")
    lemma_dict = create_lemma_dict(path2lemma_file)
    path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")
    save_obj(lemma_dict, path2lemmadict)

    # SPELLCHECKING
    logprint("Build and save Wordlist for Spellchecking")
    path2words_file = ressources_path + config.get("spellchecking", "input")
    words = build_words_for_spellchecking(path2words_file)
    path2words_counter = ressources_path + config.get("spellchecking", "pickle_file")
    save_obj(words, path2words_counter)

    # STOPWORDS
    logprint("Build and save stoppwortliste")
    stop1 = ressources_path + config.get("de_stopwords", "input1")
    stop2 = ressources_path + config.get("de_stopwords", "input2")
    stop3 = ressources_path + config.get("de_stopwords", "input3")
    de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
    path2stopwordlist_de = ressources_path + config.get("de_stopwords", "pickle_file")
    save_obj(de_stop_words, path2stopwordlist_de)
    path2stopwordlist_en = ressources_path + config.get("en_stopwords", "pickle_file")
    save_obj(en_stop_words, path2stopwordlist_en)

    # NOMEN
    logprint("Build and save nomenliste")
    nouns0 = ressources_path + config.get("nouns", "input")
    nouns1 = ressources_path + config.get("nouns", "input1")
    nouns2 = ressources_path + config.get("nouns", "input2")
    nouns = list_from_files(nouns0, nouns1, nouns2)
    path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
    save_obj(nouns, path2nouns_list)

    # VORNAMEN
    logprint("Build and save firstnameslist")
    firstnames_txt = ressources_path + config.get("firstnames", "input")
    vornamen = list_from_files(firstnames_txt)
    path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")
    save_obj(vornamen, path2firstnameslist)

    end = time.time()
    logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()