2017-10-11 17:16:04 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
import time
|
|
|
|
import logging
|
2017-10-16 14:01:38 +02:00
|
|
|
from stop_words import get_stop_words
|
|
|
|
|
|
|
|
#import words as words
|
2017-10-11 17:16:04 +02:00
|
|
|
from nltk.corpus import stopwords as nltk_stopwords
|
|
|
|
from collections import Counter
|
|
|
|
import csv
|
|
|
|
import re
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
import spacy
|
|
|
|
import textacy
|
|
|
|
from scipy import *
|
|
|
|
import sys
|
|
|
|
csv.field_size_limit(sys.maxsize)
|
|
|
|
import pickle
|
2017-10-16 14:01:38 +02:00
|
|
|
import configparser as ConfigParser
|
|
|
|
from miscellaneous import *
|
2017-10-11 17:16:04 +02:00
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
# load config
|
2017-10-11 17:16:04 +02:00
|
|
|
# Absolute location of the project's INI configuration file.
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

# Parse the configuration once at import time; the module-level path
# constants further down are read from this parser.
config = ConfigParser.ConfigParser()

with open(config_ini) as f:
    config.read_file(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def create_lemma_dict(path2lemmalist):
    """
    Build a word -> lemma lookup table from a lemma list file.

    Each line of the file is expected to hold a lemma followed by one of its
    word forms, separated by whitespace:

        l1 w1
        l1 w2
        l2 w1
        l2 w2

    Result will be used as lemma_dict["word"] --> lemma

    Lemma and word are lower-cased. If a word occurs on several lines, the
    last line wins. Lines with fewer than two whitespace-separated tokens
    (e.g. blank lines) are skipped; tokens after the second one are ignored.

    :param path2lemmalist: str, path of the lemma list file
    :return: dictionary mapping word -> lemma
    """
    lemma_dict = {}

    for raw_line in textacy.fileio.read_file_lines(path2lemmalist):
        lem_word_pair = textacy.preprocess.normalize_whitespace(raw_line).split()

        # A blank or truncated line carries no (lemma, word) pair; indexing it
        # would raise IndexError, so skip it instead of aborting the build.
        if len(lem_word_pair) < 2:
            continue

        # split() already removed all surrounding whitespace from the tokens.
        lemma = lem_word_pair[0].lower()
        word = lem_word_pair[1].lower()

        lemma_dict[word] = lemma

    return lemma_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def _normalize_thesaurus_word(string):
    """Return the normalised dictionary key for a deWordNet writtenForm."""
    # Lower-case first so that capital umlauts (e.g. "Ä") are transliterated
    # too; replacing before lower-casing would let them through as "ä".
    string = string.lower()

    # replaceRockDots
    for umlaut, replacement in (("ß", "ss"), ("ö", "oe"), ("ü", "ue"), ("ä", "ae")):
        string = string.replace(umlaut, replacement)

    # remove all dots
    string = string.replace(".", "")

    # remove everything in parentheses (greedy: from the first "(" to the
    # last ")" on a line)
    string = re.sub(r"\((.*)\)", " ", string)

    # normalise longer runs of whitespace
    string = textacy.preprocess.normalize_whitespace(string)

    return string.strip()


def build_thesaurus_dict(path2wordnet, returnall=False):
    """
    Create a thesaurus dict out of the deWordNet
    https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml

    Every "LexicalEntry" element contributes one word (the "writtenForm"
    attribute found on its children) together with the "synset" attributes of
    its children. Words are normalised before they are used as keys:
    lower-cased, ß/ö/ü/ä transliterated to ss/oe/ue/ae, dots removed,
    parenthesised parts removed and whitespace collapsed.

    Result will be used as thesaurus["word"] --> main synonym, where the main
    synonym is the first word registered for the word's first synset.

    :param path2wordnet: str, path of the deWordNet XML file
    :param returnall: bool, if True also return word2synsets and synset2Words
    :return: dictionary thesaurus, or the tuple
             (thesaurus, word2synsets, synset2Words) if returnall is True
    """
    lexroot = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8")).getroot()

    # word2synsets looks like {"w1": ["s1", "s2"]}
    word2synsets = {}

    for ro in lexroot:
        for elem in ro:
            if elem.tag != "LexicalEntry":
                continue

            synlist = []
            string = "WORD"  # key used when no child carries a writtenForm

            for subentry in elem:
                lex_dict = subentry.attrib

                if "synset" in lex_dict:
                    synlist.append(lex_dict["synset"])

                if "writtenForm" in lex_dict:
                    string = _normalize_thesaurus_word(lex_dict["writtenForm"])

            # Entries that normalise to the same word overwrite each other.
            word2synsets[string] = synlist

    # synset2Words looks like {"s1": ["w1", "w2"]}
    synset2Words = {}

    for word, synsets in word2synsets.items():
        if word != '':
            for syn in synsets:
                synset2Words.setdefault(syn, []).append(word)

    # Sort by the number of whitespace-separated words in each string.
    # NOTE(review): this orders the synset id lists of word2synsets, not the
    # word lists of synset2Words - confirm that this is the intended target.
    for synsets in word2synsets.values():
        synsets.sort(key=lambda x: len(x.split()))

    # thesaurus looks like {"w1": "mainsyn"}
    thesaurus = {}

    for word, synsets in word2synsets.items():
        try:
            # Assumption: the first synonym is the main synonym.
            # TODO: search for "(hauptform)" instead?
            thesaurus[word] = synset2Words[synsets[0]][0]
        except (IndexError, KeyError):
            # A word without any synset (IndexError) or whose synset was never
            # registered in synset2Words (KeyError) gets no thesaurus entry.
            pass

    if returnall:
        return thesaurus, word2synsets, synset2Words
    return thesaurus
|
2017-10-12 15:57:56 +02:00
|
|
|
|
|
|
|
|
2017-10-11 17:16:04 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def _stopwords_for_language(lang, nltk_name, paths):
    """Collect the de-duplicated stop words of one language from packages and files."""
    # Local import: the module-level imports visible in this file do not
    # bring `os` into scope explicitly.
    import os

    # from packages
    from_stop_words = list(get_stop_words(lang))
    from_nltk = list(nltk_stopwords.words(nltk_name))
    from_spacy = list(__import__("spacy." + lang, globals(), locals(), ['object']).STOP_WORDS)

    # from files: keep only paths named <lang>_stopwords_<something>
    filepaths = []
    for path in paths:
        name_parts = os.path.basename(path).split("_")
        # The length check avoids an IndexError for a file named just <lang>.
        if len(name_parts) >= 2 and name_parts[0] == lang and name_parts[1] == 'stopwords':
            filepaths.append(path)
    from_files = list_from_files(*filepaths)

    # combine everything
    normalized = map(textacy.preprocess.normalize_whitespace,
                     from_stop_words + from_nltk + from_spacy + from_files)
    return list(set(map(replaceRockDots(), normalized)))


def create_stopword_lists(*paths):
    """
    Create the German and the English stop word list.

    For each language the stop words are gathered from the packages
        stop_words
        nltk
        spacy
    and from the matching files in paths. Every entry is whitespace-normalised
    and passed through replaceRockDots(); duplicates are removed, so the order
    of the returned lists is not defined.

    :param paths: additional filepaths where each file looks like
        w1
        w2
        w3
    filenames must be a la de_stopwords_1.txt, en_stopwords_2.txt; paths whose
    filename does not follow this pattern are ignored

    :return: lists: de_stop_words, en_stop_words
    """
    de_stop_words = _stopwords_for_language("de", 'german', paths)
    en_stop_words = _stopwords_for_language("en", 'english', paths)

    return de_stop_words, en_stop_words
|
2017-10-11 17:16:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def build_words_for_spellchecking(path2words):
    """
    Create the word Counter used for spellchecking.

    The whole file is read, lower-cased and split into runs of regex word
    characters; the Counter maps each such token to its number of occurrences.

    http://norvig.com/spell-correct.html
    http://wortschatz.uni-leipzig.de/en/download
    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz

    :param path2words: str, path of the text file to count words in
    :return: Counter
    """
    # Read inside a context manager so the file handle is closed right away
    # instead of being left open until garbage collection.
    with open(path2words) as corpus_file:
        text = corpus_file.read()

    return Counter(re.findall(r'\w+', text.lower()))
|
2017-10-11 17:16:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
##################################################################################################
|
2017-10-11 17:16:04 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
# Input files and pickle targets, all read from the parsed configuration.

# THESAURUS
path2wordnet = config.get("thesaurus", "input")
path2thesaurus_dict = config.get("thesaurus", "pickle_file")

# SPELLCHECKING
path2words_file = config.get("spellchecking", "input")
path2wordlist = config.get("spellchecking", "pickle_file")

# LEMMA
path2lemma_file = config.get("lemmatization", "input")
path2lemmadict = config.get("lemmatization", "pickle_file")

# NOUNS
nouns1 = config.get("nouns", "input1")
nouns2 = config.get("nouns", "input2")
path2nouns_list = config.get("nouns", "pickle_file")

# FIRST NAMES
firstnames_txt = config.get("firstnames", "input")
path2firstnameslist = config.get("firstnames", "pickle_file")

# STOPWORDS
stop1 = config.get("de_stopwords", "input1")
stop2 = config.get("de_stopwords", "input2")
stop3 = config.get("de_stopwords", "input3")
path2stopwordlist = config.get("de_stopwords", "pickle_file")
|
2017-10-11 17:16:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def main():
    """
    Build every lookup resource and hand it to save_obj with its target path.

    In order: lemma dict, word Counter for spellchecking, thesaurus, German
    stop word list, noun list and first name list. Input and output paths are
    the module-level constants read from the configuration. Progress and the
    elapsed time in minutes are reported through printlog.
    """
    start = time.time()
    printlog("Init: {0}".format(datetime.now()))

    printlog("create and save lemma_dict")
    lemma_dict = create_lemma_dict(path2lemma_file)
    save_obj(lemma_dict, path2lemmadict)

    printlog("Build and save Wordlist for Spellchecking")
    words = build_words_for_spellchecking(path2words_file)
    save_obj(words, path2wordlist)

    printlog("Build and save Thesaurus")
    thesaurus = build_thesaurus_dict(path2wordnet)
    save_obj(thesaurus, path2thesaurus_dict)

    printlog("Build and save stoppwortliste")
    # create_stopword_lists returns (de_stop_words, en_stop_words). Unpack it
    # so that only the German list is stored under the de_stopwords path,
    # not the whole tuple.
    de_stop_words, _en_stop_words = create_stopword_lists(stop1, stop2, stop3)
    save_obj(de_stop_words, path2stopwordlist)

    printlog("Build and save nomenliste")
    nouns = list_from_files(nouns1, nouns2)
    save_obj(nouns, path2nouns_list)

    printlog("Build and save firstnameslist")
    vornamen = list_from_files(firstnames_txt)
    save_obj(vornamen, path2firstnameslist)

    end = time.time()
    printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()
|