aufgeräumt ("tidied up")

This commit is contained in:
jannis.grundmann 2017-10-16 14:01:38 +02:00
parent 4fe12679fb
commit 56c8bce2d7
15 changed files with 944 additions and 1023 deletions


@ -1,21 +1,67 @@
[filepath]
thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
[thesaurus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl
logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
[spellchecking]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl
lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt
[lemmatization]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl
[nouns]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl
[firstnames]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl
[de_stopwords]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl
[logging]
level = INFO
filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
[de_corpus]
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv
path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = de_raw_ticket
pre = de_pre_ticket
[en_corpus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv
path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = en_raw_ticket
pre = en_pre_ticket
[tickets]
content_collumn_name = Description
metaliste = TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution
language = de
[preprocessing]
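For reference, a minimal sketch of how the scripts below read this file (via configparser, as done in miscellaneous.py). Treating metaliste as a list via split(",") is an assumption on my part, since the value is stored as a single comma-separated string:

import configparser as ConfigParser

config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

path2de_csv = config.get("de_corpus", "input")
content_collumn_name = config.get("tickets", "content_collumn_name")
metaliste = config.get("tickets", "metaliste").split(",")  # assumption: the comma-separated value is used as a list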


@ -1,6 +1,35 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
import time
from datetime import datetime
@ -17,87 +46,15 @@ import sys
csv.field_size_limit(sys.maxsize)
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
"categoryName",
"Impact",
"Urgency",
"BenutzerID",
"VerantwortlicherID",
"EigentuemerID",
"Solution"
]
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
# todo configuration file
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
"""
# config logging
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def printRandomDoc(textacyCorpus):
    import random

    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))
    print()
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
@ -146,75 +103,93 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
yield metadata
def save_corpus(corpus, corpus_path, corpus_name, parser):
    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    #todo save vocab?
    """

    # save parser
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
##################################################################################################
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
"""
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
"categoryName",
"Impact",
"Urgency",
"BenutzerID",
"VerantwortlicherID",
"EigentuemerID",
"Solution"
]
"""
content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = config.get("tickets","metaliste")
path2de_csv = config.get("de_corpus","input")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")
path2en_csv = config.get("en_corpus","input")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
def main():
start = time.time()
printlog("Corporization: {0}".format(datetime.now()))
#print paths
path_csv_split = path2de_csv.split("/")
printlog(path_csv_split[len(path_csv_split) - 1])
path_csv_split = path2en_csv.split("/")
printlog(path_csv_split[len(path_csv_split) - 1])
start = time.time()
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")
de_corpus = textacy.Corpus(DE_PARSER)
en_corpus = textacy.Corpus(EN_PARSER)
raw_de_corpus = textacy.Corpus(DE_PARSER)
raw_en_corpus = textacy.Corpus(EN_PARSER)
## add files to textacy-corpus,
printlog("Add texts to textacy-corpus")
## add files to textacy-corpi,
printlog("Add texts to textacy-corpi")
de_corpus.add_texts(
raw_de_corpus.add_texts(
ticketcsv_to_textStream(path2de_csv, content_collumn_name),
ticket_csv_to_DictStream(path2de_csv, metaliste)
)
# kick empty docs out of the corpus
de_corpus.remove(lambda doc: len(doc) == 0)
raw_en_corpus.add_texts(
ticketcsv_to_textStream(path2en_csv, content_collumn_name),
ticket_csv_to_DictStream(path2en_csv, metaliste)
)
for i in range(20):
printRandomDoc(de_corpus)
# kick empty docs out of the corpora
raw_de_corpus.remove(lambda doc: len(doc) == 0)
raw_en_corpus.remove(lambda doc: len(doc) == 0)
#save corpus
#for i in range(20):
# printRandomDoc(raw_de_corpus)
# printRandomDoc(raw_en_corpus)
save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name,parser=DE_PARSER)
# todo: do the same for en_corpus
#save corpi
save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)

Binary file not shown.


@ -1,9 +0,0 @@
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"subject"}
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"frau hinrichs überdenkt die situation und macht dann neue anträge . dieses ticket wird geschlossen"}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"faxnummer 3166 wurde unter die telefonnummer 7179 im elektronischen telefonbuch eingetragen"}
{"categoryName":"lan","Subject":"defekte netzwerkdose frage zu vpn","Solution":"hallo herr rauner , die netzwerkdose weist z. z. keine verbindungsprobleme auf . falls doch welche bestehen , melden sie sich bitte bei uns . mit freunldichen grüßen aicha oikrim"}
{"categoryName":"betrieb","Subject":"sso login via browser mit zertifikat","Solution":"der login via zertifikat am sso - dienst mittels firefox und unicard sollte funktionieren . eventuell wurden durch ein browserupdate die einstellungen gelöscht . bitte prüfen sie ob die ca - zertifikate installiert sind : https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" und ob das kryptographie modul im firefox hinterlegt ist : https://service.tu-dortmund.de/group/intra/authentifizierung"}
{"categoryName":"elektronisches telefonbuch","Subject":"telephone contract","Solution":"erledigt"}
{"categoryName":"verwaltung","Subject":"laptop macht komische geräusche","Solution":"herr alexev swetlomier ( hiwi ) küümert sich bereits um das laptop und frau herbst weiß auch bescheid die zur zeit im urlaub ist"}

File diff suppressed because one or more lines are too long

init.py

@ -4,6 +4,9 @@ from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
@ -15,58 +18,35 @@ from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
# todo configuration file ?
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
"""
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"
# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)
def create_lemma_dict(path2lemmalist):
"""
Creates a dict out of a file a la:
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")
l1 w1
l1 w2
l2 w1
l2 w2
Result will be used as lemma_dict["word"] --> lemma
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss",
(re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
def printlog(string, level="INFO"):
"""log and prints"""
print(string)
if level == "INFO":
logging.info(string)
elif level == "DEBUG":
logging.debug(string)
elif level == "WARNING":
logging.warning(string)
def save_obj(obj, path):
with open(path + '.pkl', 'wb') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(path ):
with open(path + '.pkl', 'rb') as f:
return pickle.load(f)
def create_lemma_dict(lemmalist):
:param path2lemmalist: str
:return: dictionary
"""
lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
textacy.fileio.read_file_lines(path2lemmalist))))
lemma_dict = {}
@ -81,69 +61,22 @@ def create_lemma_dict(lemmalist):
return lemma_dict
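A usage sketch of the resulting dict (the word "häuser" is only an illustrative key): look a word up and fall back to the surface form if it is not listed.

lemma_dict = create_lemma_dict(config.get("lemmatization", "input"))
word = "häuser"
print(lemma_dict.get(word, word))  # -> the listed lemma, or the word unchanged if it is not in the file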
"""
def build_thesaurus(path2lexicalentries, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
def build_thesaurus_dict(path2wordnet,returnall=False):
"""
Creates a dict out of the deWordNet
https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml
Result will be used as thesaurus["word"] --> main synonym

:param path2wordnet: str
:param returnall: bool; if True, also return word2synsets and synset2Words
:return: dict: thesaurus (and word2synsets, synset2Words if returnall=True)
"""
lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))
lexroot = lextree.getroot()
synroot = syntree.getroot()
thesaurus = []
for r in synroot:
for element in r:
if element.tag == "Synset":
sysnet = []
attrib = element.attrib
id = attrib["id"]
for ro in lexroot:
for elem in ro:
if elem.tag == "LexicalEntry":
subs_dicts = [subentry.attrib for subentry in elem]
# <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict
if "synset" in dic.keys():
if dic["synset"] == id:
string = (dic["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# remove all dots
string = re.sub(r'[.]', "", string)
# remove everything in parentheses
string = re.sub(r"\((.*)\)", " ", string)
# normalize longer runs of whitespace
string = textacy.preprocess.normalize_whitespace(string)
sysnet.append(string.lower().strip())
# sort by the number of words in the strings
sysnet.sort(key=lambda x: len(x.split()))
if len(sysnet) != 0:
# todo: why are some of them empty?
thesaurus.append(sysnet)
return thesaurus
# todo: turn the thesaurus into a dictionary
"""
def build_thesaurus(path2lexicalentries):#, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
#syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
lexroot = lextree.getroot()
#synroot = syntree.getroot()
word2synsets = {}
template = {"w1": ["s1", "s2"]}
@ -167,6 +100,9 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
if 'writtenForm' in lex_dict.keys():
string = (lex_dict["writtenForm"])
if string == "Kennwort":
pass
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
@ -186,15 +122,17 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
word2synsets[string] = synlist
synset2Words = {}
template = {"s1": ["w1","w2"]}
for word,synset in word2synsets.items():
for syn in synset:
if syn not in synset2Words.keys():
synset2Words[syn] = [word]
else:
synset2Words[syn].append(word)
if word != '':
for syn in synset:
if syn not in synset2Words.keys():
synset2Words[syn] = [word]
else:
synset2Words[syn].append(word)
# sort by the number of words in the strings
for synset in word2synsets.values():
@ -203,91 +141,135 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
thesaurus = {}
thesaurus_template = {"w1" : "mainsyn"}
for word,synset in word2synsets.items():
try:
thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym  # todo: look for "(Hauptform)" instead?
except:
pass
return thesaurus
if returnall:
return thesaurus, word2synsets, synset2Words
else:
return thesaurus
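A usage sketch of the resulting dict ("kennwort" is only an illustrative key): the lookup is meant to map a normalized token to its main synonym, with a fallback to the token itself.

thesaurus = build_thesaurus_dict(config.get("thesaurus", "input"))
token = "kennwort"
print(thesaurus.get(token, token))  # -> the entry's main synonym, or the token unchanged if it is not in deWordNet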
def create_stopword_lists(*paths):
"""
for r in synroot:
for element in r:
creates a list of stopwords from:
spacy
nltk
stop_words
if element.tag == "Synset":
synset = []
attrib = element.attrib
id = attrib["id"]
:param paths: list of additional filepaths where each file looks like
w1
w2
w3
filenames must be a la de_stopwords_1.txt, en_stopwords_2.txt
if id not in synset2Words.keys():
synset2Words[id] = "WORD"
:return: lists: de_stopwords, en_stopwords
"""
## GERMAN
# from packages
de_stop_words1 = list(get_stop_words("de"))
de_stop_words2 = list(nltk_stopwords.words('german'))
de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
#from files
de_filepaths = []
for path in paths:
if os.path.basename(path).split("_")[0] == 'de' and os.path.basename(path).split("_")[
1] == 'stopwords':
de_filepaths.append(path)
def create_stopwordlist():
de_stop_words4 = list_from_files(*de_filepaths)
de_stop_words1 = list(map(replaceRockDots(),
list(
map(textacy.preprocess.normalize_whitespace,
textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")
)
)
)
)
de_stop_words2 = list(map(replaceRockDots(),list(set(nltk_stopwords.words('german')))))
de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))
de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))
de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))
return de_stop_words
#todo en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(nltk_stopwords.words('english'))))
#combine everything
de_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
## ENGLISH
# from packages
en_stop_words1 = list(get_stop_words("en"))
en_stop_words2 = list(nltk_stopwords.words('english'))
en_stop_words3 = list(__import__("spacy.en", globals(), locals(), ['object']).STOP_WORDS)
# from files
en_filepaths = [path for path in paths if
os.path.basename(path).split("_")[0] == 'en' and os.path.basename(path).split("_")[
1] == 'stopwords']
en_stop_words4 = list_from_files(*en_filepaths)
########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download
# combine everything
en_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))
return de_stop_words, en_stop_words
def build_words_for_spellchecking(path2words):
    """
    create a word Counter for spellchecking

    http://norvig.com/spell-correct.html
    http://wortschatz.uni-leipzig.de/en/download
    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz

    :return: Counter
    """
    def words(text): return re.findall(r'\w+', text.lower())

    return Counter(words(open(path2words).read()))
def words(text): return re.findall(r'\w+', text.lower())
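A usage sketch of the resulting Counter, following Norvig's spell corrector that it is meant to feed (the helper P() is hypothetical, not part of the repo):

words_counter = build_words_for_spellchecking(config.get("spellchecking", "input"))
total = sum(words_counter.values())

def P(word):
    # relative frequency of a word in the deu_news_2015_1M corpus
    return words_counter[word] / total

print(P("telefon"))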
##################################################################################################
# goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable files
# plus a clean stopword list and a noun list
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
path2wordnet = config.get("thesaurus","input")
path2thesaurus_dict = config.get("thesaurus","pickle_file")
# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2words_file = config.get("spellchecking","input")
path2wordlist = config.get("spellchecking","pickle_file")
# LEMMA
path2lemma_file = config.get("lemmatization","input")
path2lemmadict = config.get("lemmatization","pickle_file")
# NOUNS
nouns1 = config.get("nouns","input1")
nouns2 = config.get("nouns","input2")
path2nouns_list = config.get("nouns","pickle_file")
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
# FIRST NAMES
firstnames_txt = config.get("firstnames","input")
path2firstnameslist = config.get("firstnames","pickle_file")
# STOPWORDS
stop1 = config.get("de_stopwords","input1")
stop2 = config.get("de_stopwords","input2")
stop3 = config.get("de_stopwords","input3")
path2stopwordlist = config.get("de_stopwords","pickle_file")
@ -297,71 +279,42 @@ def main():
printlog("create and save lemma_dict")
LEMMAS = list(
textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
lemma_dict = create_lemma_dict(LEMMAS)
lemma_dict = create_lemma_dict(path2lemma_file)
save_obj(lemma_dict, path2lemmadict)
printlog("Build and save Wordlist for Spellchecking")
WORDS = Counter(words(open(path2words).read()))
save_obj(WORDS, path2wordlist)
words = build_words_for_spellchecking(path2words_file)
save_obj(words, path2wordlist)
printlog("Build and save Thesaurus")
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)
save_obj(THESAURUS, path2thesaurusdict)
thesaurus = build_thesaurus_dict(path2wordnet)
save_obj(thesaurus, path2thesaurus_dict)
printlog("Build and save stoppwortliste")
de_stop_words = create_stopwordlist()
de_stop_words = create_stopword_lists(stop1, stop2, stop3)
save_obj(de_stop_words, path2stopwordlist)
printlog("Build and save nomenliste")
NOUNS = list(textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
save_obj(NOUNS, path2NOUNSlist)
printlog("Build and save fistnameslist")
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
save_obj(VORNAMEN, path2firstnameslist)
nouns = list_from_files(nouns1,nouns2)
save_obj(nouns, path2nouns_list)
printlog("Build and save firstnameslist")
vornamen = list_from_files(firstnames_txt)
save_obj(vornamen, path2firstnameslist)
end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))

main.py Normal file

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
import init
import corporization
import preprocessing
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
init.main()
printlog("")
corporization.main()
printlog("")
preprocessing.main()
printlog("")

miscellaneous.py Normal file

@ -0,0 +1,281 @@
# -*- coding: utf-8 -*-
import random
import time
import logging
import csv
import functools
import os.path
import re
import subprocess
import sys
import pickle
import warnings
import xml.etree.ElementTree as ET
import configparser as ConfigParser
from pathlib import Path
from datetime import datetime
from collections import Counter

import enchant
import hunspell
import spacy
import textacy
from textacy import Vectorizer
from scipy import *
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
from postal.parser import parse_address

csv.field_size_limit(sys.maxsize)

start = time.time()
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

# config logging
filename = config.get("logging", "filename")
level = config.get("logging", "level")
if level == "INFO":
    level = logging.INFO
elif level == "DEBUG":
    level = logging.DEBUG
elif level == "WARNING":
    level = logging.WARNING

logging.basicConfig(filename=filename, level=level)
def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)
def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))

    return functools.reduce(compose2, functions, lambda x: x)
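A usage sketch (the two helpers are hypothetical): compose() applies the rightmost function first, i.e. compose(f, g)(x) == f(g(x)).

strip_ws = lambda s: " ".join(s.split())
lower = lambda s: s.lower()

clean = compose(strip_ws, lower)             # lower() runs first, then strip_ws()
print(clean("  Defekte   Netzwerkdose  "))   # -> "defekte netzwerkdose"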
def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")
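A usage sketch (some_step is a hypothetical function): called inside a function, get_calling_function() resolves to that function object.

def some_step():
    func = get_calling_function()             # resolves to some_step itself
    printlog("in function: " + func.__name__)

some_step()  # logs "in function: some_step"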
def save_obj(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe",
                                         (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
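A usage sketch: the returned callable lowercases a string and transliterates German umlauts and ß (ä -> ae, ö -> oe, ü -> ue, ß -> ss).

normalize = replaceRockDots()
print(normalize("Größenänderung"))  # -> "groessenaenderung"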
def list_from_files(*paths):
    """
    create a string list from files that look like
    n1
    n2
    n3

    :param paths: list(str) or str if single path
    :return: list(str)
    """
    listlist = []
    for path in paths:
        listlist.append(list(textacy.fileio.read_file_lines(path)))

    # flatten the list of lists into one list
    liste = [item for sublist in listlist for item in sublist]

    return list(map(textacy.preprocess.normalize_whitespace, liste))
def printRandomDoc(textacyCorpus):
    """
    prints and logs a random doc out of a textacy corpus

    :param textacyCorpus:
    """
    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))
    print()
def save_corpus(corpus, corpus_path, corpus_name):
    """
    saves a textacy corpus including its spacy parser

    :param corpus: textacy.Corpus
    :param corpus_path: str
    :param corpus_name: str (should contain the language, e.g. "_de_")
    """

    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    #todo save vocab?
    """

    # save parser
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
def load_corpus(corpus_path, corpus_name, lang="de"):
    """
    Loads a textacy corpus, including its spacy parser, from file

    :param corpus_path: str
    :param corpus_name: str (should contain the language, e.g. "_de_")
    :param lang: str (language code)
    :return: textacy.Corpus, spacy.language
    """

    # check for language
    if "_de_" in corpus_name:
        lang = "de"
    elif "_en_" in corpus_name:
        lang = "en"

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    # load corpus
    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))

    return corpus, corpus.spacy_lang
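A round-trip sketch, assuming a corpus built as in corporization.py (raw_de_corpus) and the [de_corpus] paths from config.ini:

corpus_de_path = config.get("de_corpus", "path")  # .../corpi/
raw_de_name = config.get("de_corpus", "raw")      # de_raw_ticket

# write parser, docs and metadata to disk, then read them back
save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
reloaded, parser = load_corpus(corpus_path=corpus_de_path, corpus_name=raw_de_name)
printRandomDoc(reloaded)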


@ -420,10 +420,10 @@ custom_words = ["grüßen", "fragen"]
####################'####################'####################'####################'####################'##############
## files to textacy-corpus
## files to textacy-corpi
textacyCorpus = textacy.Corpus(PARSER)
print("add texts to textacy-corpus...")
print("add texts to textacy-corpi...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))


@ -182,8 +182,8 @@ cleanStream = compose(
cleanEnt
)
"""
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpus
# metadata:xml -> -> stringCleaning -> corpus
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpi
# metadata:xml -> -> stringCleaning -> corpi
corpus = textacy.Corpus(PARSER)


@ -2,27 +2,53 @@
from datetime import datetime
print(datetime.now())
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"
# idea: save a raw corpus (only whitespace removed) -> preprocessed corpus -> work with that
path_csv_split = path2de_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])
path_csv_split = path2en_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])
import time
start = time.time()
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import time
import logging
from nltk.corpus import stopwords
@ -40,231 +66,29 @@ csv.field_size_limit(sys.maxsize)
import pickle
def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)
def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    # load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"
print(load_corpus(corpus_path,corpus_name))
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
# todo configuration file ?
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
"""
# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
# THESAURUS
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
THESAURUS = load_obj(path2thesaurusdict)
# SPELLCHECKING
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
# SPELLCHECKING
parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")
de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))
en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(stopwords.words('english'))))
LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
NOUNS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
"""
print(de_stop_words[10:30])
print(LEMMAS[10:30])
print(VORNAMEN[10:30])
print(NOUNS[10:30])
"""
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
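Illustrative only: the finders are used to mask or strip such patterns in ticket texts before parsing, e.g. e-mail addresses.

masked = emailFinder.sub("EMAIL", "bitte an support@tu-dortmund.de wenden")
print(masked)  # -> "bitte an EMAIL wenden"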
def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)
printlog("Load functions")
def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))

    return functools.reduce(compose2, functions, lambda x: x)
def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")
def printRandomDoc(textacyCorpus):
    import random

    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))
    print()
def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    content_collumn = 0  # default value
    for i, lst in enumerate(stream):
        if i == 0:
            # look for the desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]
def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    content_collumn = 0  # default value
    metaindices = []
    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            for j, col in enumerate(lst):  # could surely be done more efficiently... fine, it only happens once
                for key in metalist:
                    if key == col:
                        metaindices.append(j)
            metadata_temp = dict(
                zip(metalist, metaindices))  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata
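For reference, the two generators are consumed in parallel by textacy's Corpus.add_texts, mirroring what corporization.py does with its ticketcsv_* variants; a sketch assuming the module-level DE_PARSER and path2de_csv defined above:

corpus = textacy.Corpus(DE_PARSER)
corpus.add_texts(
    csv_to_contentStream(path2de_csv, "Description"),
    csv_to_metaStream(path2de_csv, ["TicketNumber", "Subject", "categoryName", "Solution"])
)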
REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = []
VORNAMEN= []
de_stop_words=[]
############# filter tokens
@ -303,14 +127,12 @@ def remove_words_containing_Numbers():
return lambda tok: not bool(re.search('\d', tok.lower_))
"""
def remove_words_containing_topLVL():
return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
return lambda tok: not bool(re.se