refactoring
This commit is contained in:
parent b542c4285a
commit 43955a17f2
Binary file not shown.
@@ -0,0 +1,9 @@
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"subject"}
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"frau hinrichs überdenkt die situation und macht dann neue anträge . dieses ticket wird geschlossen"}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"faxnummer 3166 wurde unter die telefonnummer 7179 im elektronischen telefonbuch eingetragen"}
{"categoryName":"lan","Subject":"defekte netzwerkdose frage zu vpn","Solution":"hallo herr rauner , die netzwerkdose weist z. z. keine verbindungsprobleme auf . falls doch welche bestehen , melden sie sich bitte bei uns . mit freunldichen grüßen aicha oikrim"}
{"categoryName":"betrieb","Subject":"sso login via browser mit zertifikat","Solution":"der login via zertifikat am sso - dienst mittels firefox und unicard sollte funktionieren . eventuell wurden durch ein browserupdate die einstellungen gelöscht . bitte prüfen sie ob die ca - zertifikate installiert sind : https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" und ob das kryptographie modul im firefox hinterlegt ist : https://service.tu-dortmund.de/group/intra/authentifizierung"}
{"categoryName":"elektronisches telefonbuch","Subject":"telephone contract","Solution":"erledigt"}
{"categoryName":"verwaltung","Subject":"laptop macht komische geräusche","Solution":"herr alexev swetlomier ( hiwi ) küümert sich bereits um das laptop und frau herbst weiß auch bescheid die zur zeit im urlaub ist"}
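The nine added lines are newline-delimited JSON, one labeled ticket per line with a "categoryName", a "Subject" and a free-text "Solution". A minimal sketch, with a placeholder file name not taken from this commit, for reading such a file in Python:

import json

# Hypothetical path to the JSONL ticket export added in this commit.
jsonl_path = "corpus/tickets_sample.jsonl"

with open(jsonl_path, encoding="utf-8") as f:
    tickets = [json.loads(line) for line in f if line.strip()]

# Each record carries the label used later for topic modeling plus two text fields.
for ticket in tickets:
    print(ticket["categoryName"], "->", ticket["Subject"])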
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
preprocessing.py (1069 changed lines)
File diff suppressed because it is too large
testra.py (85 changed lines)
@@ -16,7 +16,7 @@ import xml.etree.ElementTree as ET
print(datetime.now())

"""
PARSER=spacy.load("de")
@@ -51,12 +51,32 @@ corpus.add_texts(
)

print(corpus)
"""

from postal.parser import parse_address

address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
print(parse_address(address))

address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
print(parse_address(address))
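For reference, pypostal's parse_address returns a list of (value, component) tuples rather than a dict; component names such as "house", "road" or "postcode" depend on what libpostal detects in the string. A small sketch, not part of this commit, for turning one result into a dictionary (the last value wins if a component repeats):

components = {label: value for value, label in parse_address(address)}
print(components.get("postcode"), components.get("road"))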
"""

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "testcorpus"

"""
#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
@@ -69,7 +89,7 @@ path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)
"""

def save_corpus(corpus_path,corpus_name):
@@ -121,13 +141,7 @@ save_corpus(corpus_path,corpus_name)
print(load_corpus(corpus_path,corpus_name))

# idea: do the same for the spellchecker, lemmatizer and thesaurus, because of memory
# todo: reload the generators each time? because of sequential picking at runtime, otherwise not deterministic
"""

"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
@@ -171,11 +185,6 @@ def getHauptform(syn_block, word, default_return_first_Syn=False):
return word  # as a last resort, return the original word
"""

### extract from deWordNet.xml
# https://github.com/hdaSprachtechnologie/odenet

"""
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"
@@ -203,10 +212,6 @@ for r in root:
nomen.append(string.lower().strip())
"""

"""
import re
from collections import Counter
@@ -247,30 +252,6 @@ def edits2(word):
"""

"""
### extract from derewo
@@ -297,20 +278,6 @@ for line in raw:
textacy.fileio.write_file_lines(nomen,"nomen2.txt")
"""

"""
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
content_collumn_name = "Description"
@@ -349,8 +316,6 @@ textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter="
"""

"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
@@ -395,8 +360,6 @@ for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""

"""
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
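The nested re.sub chain in replaceRockDots can be written more compactly with str.translate; a sketch of an equivalent helper, not part of this commit:

UMLAUT_TABLE = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})

def replace_rock_dots_alt():
    # same lowercasing and umlaut/ß folding as replaceRockDots(), via a single translation table
    return lambda string: string.lower().translate(UMLAUT_TABLE)

print(replace_rock_dots_alt()("Förder- und Lagerwesen, Straße"))  # foerder- und lagerwesen, strasse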
@@ -0,0 +1,348 @@
# -*- coding: utf-8 -*-

from datetime import datetime

print(datetime.now())

import time
import enchant

start = time.time()

import logging
import csv
import functools
import os.path
import re
import subprocess
import time
import xml.etree.ElementTree as ET
import sys
import spacy
import textacy
from scipy import *
from textacy import Vectorizer
import warnings
import configparser as ConfigParser
import sys
import hunspell
from postal.parser import parse_address

csv.field_size_limit(sys.maxsize)


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)
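# Sketch (not part of this commit): printlog() hands messages to the standard logging
# module, but no handler or log file is configured in the lines shown here, so INFO and
# DEBUG records are dropped by default. An assumed minimal setup could look like this
# (the file name is chosen purely for illustration):
logging.basicConfig(
    filename="topicModelingTickets.log",  # hypothetical log file
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)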
printlog("Load functions")


def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))
    print()


def load_corpus(corpus_path, corpus_name):
    # load new lang
    nlp = spacy.load("de")

    # load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "r") as file:
        nlp.vocab.strings.load(file)

    # define corpus
    corpus = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name + "_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    # load content
    contentpath = corpus_path + corpus_name + "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))

    return corpus
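# Sketch (not part of this commit): load_corpus() expects three artifacts per corpus --
# a "_strings.json" stringstore dump, a "_meta.json" with one metadata record per line,
# and a "_content.bin" with the serialized spaCy docs. The matching writer is not shown
# in this diff; assuming this textacy version offers the symmetric write_json_lines()
# and write_spacy_docs() helpers and that the spaCy 1.x StringStore can dump() to an
# open file, it could look roughly like this:
def save_corpus_sketch(corpus, corpus_path, corpus_name):
    # stringstore -> _strings.json
    with open(corpus_path + corpus_name + "_strings.json", "w") as file:
        corpus.spacy_vocab.strings.dump(file)

    # one metadata dict per line -> _meta.json
    textacy.fileio.write_json_lines(
        (doc.metadata for doc in corpus), corpus_path + corpus_name + "_meta.json")

    # serialized spaCy docs -> _content.bin
    textacy.fileio.write_spacy_docs(
        (doc.spacy_doc for doc in corpus), corpus_path + corpus_name + "_content.bin")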
def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
    printlog(str("ngrams: {0}".format(ngrams)))
    printlog(str("min_df: {0}".format(min_df)))
    printlog(str("max_df: {0}".format(max_df)))
    printlog(str("named_entities: {0}".format(named_entities)))

    # printlog("vectorize corpus...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    # materialize the term lists: fit_transform would otherwise exhaust the generators
    # and the print loop below would have nothing left to show
    terms_list = [list(doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True))
                  for doc in de_corpus]
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.id_to_term

    for t in terms_list:
        print(t)
    printlog("doc_term_matrix: {0}".format(doc_term_matrix))
    printlog("id2term: {0}".format(id2term))
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_corpus"

# load corpus
de_corpus = load_corpus(corpus_name=corpus_name, corpus_path=corpus_path)

for i in range(5):
    printRandomDoc(de_corpus)


# todo: find a proper tf(-idf) weighting
ngrams = 1
min_df = 1
max_df = 1.0
weighting = 'tf'
# weighting = 'tfidf'
named_entities = False

"""
printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)

printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
"""

# build dictionary of ticket categories
labelist = []

for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
    labelist.append(texdoc.metadata["categoryName"])

LABELDICT = {k: v for v, k in enumerate(labelist)}

printlog(str("LABELDICT: {0}".format(LABELDICT)))
def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False,
                         corpus=de_corpus):
    printlog(
        "############################################ Topic Modeling {0} #############################################".format(
            topicModel))
    print("\n\n")
    printlog(str("ngrams: {0}".format(ngrams)))
    printlog(str("min_df: {0}".format(min_df)))
    printlog(str("max_df: {0}".format(max_df)))
    printlog(str("n_topics: {0}".format(n_topics)))
    printlog(str("named_entities: {0}".format(named_entities)))

    start = time.time()

    top_topic_words = 10
    top_document_labels_per_topic = 5

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    ########################################

    # printlog("vectorize corpus...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.id_to_term

    # printlog("terms_list: {0}".format(list(terms_list)))
    # printlog("doc_term_matrix: {0}".format(doc_term_matrix))

    ##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

    # Initialize and train a topic model
    # printlog("Initialize and train a topic model..")
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    # Transform the corpus and interpret our model:
    # printlog("Transform the corpus and interpret our model..")
    doc_topic_matrix = model.transform(doc_term_matrix)
    print()

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

    print()
    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        printlog(topic_idx)
        for j in top_docs:
            printlog(corpus[j].metadata['categoryName'])
        print()

    #####################################################################################################################
    print()
    print()

    end = time.time()
    printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
# no_below = 20
# no_above = 0.5

# n_topics = len(LABELDICT)  # len(set(ticketcorpus[0].metadata.keys()))+1  # +1 because of a default topic

"""
topicModeling(ngrams = 1,
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)

topicModeling(ngrams = 1,
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.2,
              max_df = 0.8,
              topicModel = 'lda',
              n_topics = 20,
              corpus=de_corpus)
"""
##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################

top_topic_words = 15

print("\n\n")
start = time.time()

n_topics = len(LABELDICT)  # len(set(ticketcorpus[0].metadata.keys()))+1  # +1 because of a default topic

# build dictionary of ticket categories
labelist = []

for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
    labelist.append(texdoc.metadata["categoryName"])

LABELDICT = {k: v for v, k in enumerate(labelist)}
print(LABELDICT)


def label2ID(label, labeldict=LABELDICT):
    return labeldict.get(label, len(labeldict))


def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate "[topic1, topic2, ...] tok1 tok2 tok3" lines out of the corpus
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
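# Illustration (not part of this commit): generate_labled_lines() emits one document per
# line, prefixed with its numeric category id in square brackets; that is the input fed
# to JGibbsLabledLDA below. A quick preview with an invented ticket text and any label
# known from the corpus:
example_category = labelist[0]                      # any existing categoryName
example_text = "drucker im raum 123 druckt nicht"   # invented ticket text
print("[" + str(label2ID(example_category)) + "] " + example_text)
# -> e.g. "[0] drucker im raum 123 druckt nicht" (the id depends on LABELDICT)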
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

# create file
textacy.fileio.write_file_lines(generate_labled_lines(de_corpus), filepath=LLDA_filepath)

# todo: print a ticket
# wait for file to exist
while not os.path.exists(LLDA_filepath):
    time.sleep(1)

print("\n\n")
printlog("start LLDA:")
# run the JGibbsLLDA jar
FNULL = open(os.devnull, 'w')  # suppress output
subprocess.call(["java",
                 "-cp",
                 "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
                 "jgibblda.LDA",
                 "-est",
                 "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
                 "-dfile", "tickets.gz",
                 "-twords", str(top_topic_words),
                 "-ntopics", str(n_topics)], stdout=FNULL)

# NOTE: the output files are hidden; they can be found in models/

# twords
subprocess.call(["gzip",
                 "-dc",
                 "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
#####################################################################################################################
print()
print()

end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))