369 lines
11 KiB
Python
369 lines
11 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from datetime import datetime
|
|
|
|
import time
|
|
import logging
|
|
from nltk.corpus import stopwords as nltk_stopwords
|
|
from collections import Counter
|
|
import csv
|
|
import re
|
|
import xml.etree.ElementTree as ET
|
|
import spacy
|
|
import textacy
|
|
from scipy import *
|
|
import sys
|
|
# Raise the csv module's per-field size limit to sys.maxsize (runs at import time).
csv.field_size_limit(sys.maxsize)
|
|
import pickle
|
|
|
|
|
|
# TODO: use a configuration file instead of hard-coded settings?
|
|
"""
|
|
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
|
|
|
|
config = ConfigParser.ConfigParser()
|
|
with open(config_ini) as f:
|
|
config.read_file(f)
|
|
"""
|
|
|
|
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"
|
|
|
|
|
|
# config logging
|
|
# Absolute, machine-specific path of the log file handed to logging.basicConfig.
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
|
|
# Configure the root logger at import time: write to `logfile` with threshold INFO,
# so messages sent through printlog(..., level="DEBUG") are printed but not logged.
logging.basicConfig(filename=logfile, level=logging.INFO)
|
|
|
|
|
|
|
|
|
|
# spaCy pipeline loaded under the name "de" at import time; create_stopwordlist
# reads DE_PARSER.lang to locate the matching spaCy language module.
DE_PARSER = spacy.load("de")
|
|
# spaCy pipeline loaded under the name "en" at import time; in the code visible
# here it is only mentioned in a TODO comment, not used.
EN_PARSER = spacy.load("en")
|
|
|
|
|
|
|
|
def replaceRockDots():
    """Build a normalizer that lowercases text and transliterates German umlauts.

    Returns:
        A one-argument callable. It lowercases its input and then rewrites
        "ä" -> "ae", "ü" -> "ue", "ö" -> "oe" and "ß" -> "ss".
    """
    # (pattern, replacement) pairs, applied in this order to the lowercased text.
    substitutions = (
        (r'[ä]', "ae"),
        (r'[ü]', "ue"),
        (r'[ö]', "oe"),
        (r'[ß]', "ss"),
    )

    def normalize(string):
        result = string.lower()
        for pattern, replacement in substitutions:
            result = re.sub(pattern, replacement, result)
        return result

    return normalize
|
|
|
|
def printlog(string, level="INFO"):
    """Print *string* to stdout and forward it to the root logger.

    Args:
        string: the message to print and log.
        level: "INFO", "DEBUG" or "WARNING" selects the logging call used.
            Any other value means the message is printed but not logged.
    """
    print(string)

    emitters = (
        ("INFO", logging.info),
        ("DEBUG", logging.debug),
        ("WARNING", logging.warning),
    )
    for name, emit in emitters:
        if level == name:
            emit(string)
            break
|
|
|
|
|
|
|
|
|
|
def save_obj(obj, path):
    """Pickle *obj* into the file ``path + '.pkl'``.

    Args:
        obj: any picklable object.
        path: target path without extension; '.pkl' is appended.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest available pickle protocol is used.
    """
    target = path + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
def load_obj(path):
    """Load and return the object pickled in the file ``path + '.pkl'``.

    Args:
        path: source path without extension; '.pkl' is appended.

    Returns:
        The unpickled object.

    NOTE: unpickling executes code embedded in the file, so only load files
    that this project wrote itself.
    """
    source = path + '.pkl'
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
|
|
|
|
def create_lemma_dict(lemmalist):
    """Build a word -> lemma lookup from whitespace-separated "lemma word" lines.

    Args:
        lemmalist: iterable of strings. In each line the first token is the
            lemma and the second token is the word form; further tokens are
            ignored.

    Returns:
        dict mapping the lowercased word form to the lowercased lemma. When a
        word form occurs several times, the last line wins.
    """
    lemma_dict = {}

    for line in lemmalist:
        tokens = line.split()

        # Blank or one-token lines carry no (lemma, word) pair; skip them
        # instead of failing with an IndexError.
        if len(tokens) < 2:
            continue

        # split() already removed surrounding whitespace from each token.
        lemma, word = tokens[0], tokens[1]
        lemma_dict[word.lower()] = lemma.lower()

    return lemma_dict
|
|
|
|
"""
|
|
def build_thesaurus(path2lexicalentries, path2synsets):
|
|
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
|
|
syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
|
|
|
|
lexroot = lextree.getroot()
|
|
synroot = syntree.getroot()
|
|
|
|
thesaurus = []
|
|
|
|
for r in synroot:
|
|
for element in r:
|
|
|
|
if element.tag == "Synset":
|
|
sysnet = []
|
|
attrib = element.attrib
|
|
id = attrib["id"]
|
|
|
|
for ro in lexroot:
|
|
for elem in ro:
|
|
if elem.tag == "LexicalEntry":
|
|
subs_dicts = [subentry.attrib for subentry in elem]
|
|
# <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
|
|
|
|
dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict
|
|
if "synset" in dic.keys():
|
|
if dic["synset"] == id:
|
|
string = (dic["writtenForm"])
|
|
|
|
# replaceRockDots
|
|
string = re.sub(r'[ß]', "ss", string)
|
|
string = re.sub(r'[ö]', "oe", string)
|
|
string = re.sub(r'[ü]', "ue", string)
|
|
string = re.sub(r'[ä]', "ae", string)
|
|
|
|
# alle punkte raus
|
|
string = re.sub(r'[.]', "", string)
|
|
|
|
# alles in klammern raus
|
|
string = re.sub(r"\((.*)\)", " ", string)
|
|
|
|
# längeres leerzeichen normalisieren
|
|
string = textacy.preprocess.normalize_whitespace(string)
|
|
|
|
sysnet.append(string.lower().strip())
|
|
|
|
# nach anzhal der wörter in den strings sortieren
|
|
sysnet.sort(key=lambda x: len(x.split()))
|
|
if len(sysnet) != 0:
|
|
# todo warum sind manche leer?
|
|
thesaurus.append(sysnet)
|
|
return thesaurus
|
|
|
|
#todo thesaurus in dictionary
|
|
"""
|
|
|
|
def _normalize_written_form(string):
    """Turn a lexicon ``writtenForm`` value into a normalized thesaurus key."""
    # Lowercase first so capitalized umlauts ("Ä", "Ö", "Ü") are transliterated
    # too, matching what replaceRockDots() does for the rest of the pipeline.
    string = string.lower()

    # Transliterate umlauts and sharp s.
    string = re.sub(r'[ß]', "ss", string)
    string = re.sub(r'[ö]', "oe", string)
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)

    # Remove all dots.
    string = re.sub(r'[.]', "", string)

    # Remove everything in parentheses (greedy: first "(" to last ")").
    string = re.sub(r"\((.*)\)", " ", string)

    # Collapse runs of whitespace.
    string = textacy.preprocess.normalize_whitespace(string)

    return string.strip()


def build_thesaurus(path2lexicalentries):
    """Build a word -> main-synonym mapping from a lexical-entries XML file.

    The XML is walked two levels below the root; every element tagged
    "LexicalEntry" contributes one word. Its children's attributes supply the
    word itself ("writtenForm") and the ids of the synsets it belongs to
    ("synset").

    Args:
        path2lexicalentries: path of the XML file to parse.

    Returns:
        dict mapping each normalized word to the main synonym of its first
        synset. The main synonym is the member of that synset with the fewest
        words (ties keep file order). Words without any synset are omitted.
        Entries that have no "writtenForm" are stored under the placeholder
        key "WORD".
    """
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()

    # word -> ids of its synsets, e.g. {"w1": ["s1", "s2"]}
    word2synsets = {}

    for ro in lexroot:
        for elem in ro:
            if elem.tag != "LexicalEntry":
                continue

            synlist = []
            string = "WORD"  # placeholder when the entry has no writtenForm

            for subentry in elem:
                lex_dict = subentry.attrib

                if "synset" in lex_dict:
                    synlist.append(lex_dict["synset"])

                if 'writtenForm' in lex_dict:
                    string = _normalize_written_form(lex_dict["writtenForm"])

            # A later entry with the same normalized form replaces the earlier one.
            word2synsets[string] = synlist

    # synset id -> member words, e.g. {"s1": ["w1", "w2"]}
    synset2Words = {}

    for word, synsets in word2synsets.items():
        for syn in synsets:
            synset2Words.setdefault(syn, []).append(word)

    # Order each synset's words by their number of words so the shortest
    # expression comes first. Sorting the synset-id lists of word2synsets
    # instead would be a no-op, because ids contain no whitespace. Forms that
    # normalized to the empty string go last so they never become the main
    # synonym.
    for members in synset2Words.values():
        members.sort(key=lambda x: (not x, len(x.split())))

    thesaurus = {}

    for word, synsets in word2synsets.items():
        # Words without synsets have no main synonym; skip them explicitly
        # rather than swallowing the IndexError with a bare except.
        if synsets:
            # Assumption: the first word of the first synset is the main synonym.
            thesaurus[word] = synset2Words[synsets[0]][0]

    return thesaurus
|
|
|
|
"""
|
|
for r in synroot:
|
|
for element in r:
|
|
|
|
if element.tag == "Synset":
|
|
synset = []
|
|
attrib = element.attrib
|
|
id = attrib["id"]
|
|
|
|
if id not in synset2Words.keys():
|
|
synset2Words[id] = "WORD"
|
|
"""
|
|
|
|
|
|
|
|
def create_stopwordlist():
    """Collect German stop words from four sources into one de-duplicated list.

    Sources, in order: the project's de_stop_words.txt (whitespace-normalized),
    NLTK's 'german' stop words, the STOP_WORDS of the spaCy language module
    named by DE_PARSER.lang, and the project's stopwords-de.txt. Every entry is
    passed through the normalizer returned by replaceRockDots().

    Returns:
        list of unique stop words; the order is whatever set iteration yields.
    """
    umlaut_free = replaceRockDots()

    # Source 1: project file, whitespace-normalized before transliteration.
    de_stop_words1 = [
        umlaut_free(textacy.preprocess.normalize_whitespace(line))
        for line in textacy.fileio.read_file_lines(
            "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")
    ]

    # Source 2: NLTK stop words, de-duplicated first.
    de_stop_words2 = [umlaut_free(word) for word in set(nltk_stopwords.words('german'))]

    # Source 3: STOP_WORDS of the spaCy language module ("spacy.<lang>").
    spacy_lang_module = __import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object'])
    de_stop_words3 = [umlaut_free(word) for word in spacy_lang_module.STOP_WORDS]

    # Source 4: second project file, used line by line as read.
    de_stop_words4 = [
        umlaut_free(line)
        for line in textacy.fileio.read_file_lines(
            "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt")
    ]

    combined = de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4
    return list(set(combined))
|
|
|
|
#todo en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(nltk_stopwords.words('english'))))
|
|
|
|
|
|
|
|
|
|
|
|
########################## Spellchecking ##########################################
|
|
# http://norvig.com/spell-correct.html
|
|
# http://wortschatz.uni-leipzig.de/en/download
|
|
|
|
def words(text):
    """Return every run of word characters in the lowercased *text*, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
|
|
|
|
|
|
##################################################################################################
|
|
|
|
# Goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable files,
# plus a clean stop word list and a noun list.
|
|
|
|
|
|
|
|
# THESAURUS
|
|
# XML file of lexical entries that main() passes to build_thesaurus.
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
|
|
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
|
|
# Path stem where main() pickles the thesaurus dict (save_obj appends '.pkl').
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
|
|
|
|
|
|
|
|
# SPELLCHECKING
|
|
# Plain-text corpus that main() reads completely and tokenizes into word counts.
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
|
|
# Path stem where main() pickles the word-frequency Counter (save_obj appends '.pkl').
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
|
|
|
|
|
|
|
|
|
|
# Path stem where main() pickles the word -> lemma dict (save_obj appends '.pkl').
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
|
|
# Path stem where main() pickles the stop word list (save_obj appends '.pkl').
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
|
|
# Path stem where main() pickles the noun list (save_obj appends '.pkl').
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
|
|
# Path stem where main() pickles the first-names list (save_obj appends '.pkl').
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Build every lookup resource and pickle it to its module-level output path.

    Steps, in order: lemma dict, word-frequency list for spell checking,
    thesaurus, German stop word list, noun list, first-names list. Progress
    and the total run time (in minutes) are reported through printlog.
    """
    start = time.time()
    printlog("Init: {0}".format(datetime.now()))

    # --- lemma dictionary -------------------------------------------------
    printlog("create and save lemma_dict")
    lemma_lines = list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))

    lemma_dict = create_lemma_dict(lemma_lines)
    save_obj(lemma_dict, path2lemmadict)

    # --- word frequencies for spell checking --------------------------------
    printlog("Build and save Wordlist for Spellchecking")
    # Close the corpus file deterministically and decode it as UTF-8 explicitly:
    # the default encoding depends on the locale, which may be plain ASCII when
    # the script is started via nohup over ssh.
    with open(path2words, encoding="utf-8") as corpus:
        word_counts = Counter(words(corpus.read()))
    save_obj(word_counts, path2wordlist)

    # --- thesaurus ----------------------------------------------------------
    printlog("Build and save Thesaurus")
    thesaurus = build_thesaurus(path2lexicalentries=lexicalentries)

    save_obj(thesaurus, path2thesaurusdict)

    # --- stop words -----------------------------------------------------------
    printlog("Build and save stoppwortliste")
    de_stop_words = create_stopwordlist()
    save_obj(de_stop_words, path2stopwordlist)

    # --- nouns ------------------------------------------------------------
    printlog("Build and save nomenliste")
    nouns = list(textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
    nouns = list(map(textacy.preprocess.normalize_whitespace, nouns))
    save_obj(nouns, path2NOUNSlist)

    # --- first names ------------------------------------------------------
    printlog("Build and save fistnameslist")
    first_names = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))

    save_obj(first_names, path2firstnameslist)

    end = time.time()
    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
|
|
|
|
|
|
|
|
# Run the resource build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()