topicmodeling jgibbsllda lauffähig

This commit is contained in:
jannis.grundmann 2017-10-25 09:46:44 +02:00
parent 16d3e1cb70
commit 3137dc6e54
17 changed files with 694 additions and 138 deletions

242
cleaning.py Normal file
View File

@ -0,0 +1,242 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import csv
import sys
from miscellaneous import *
from datetime import datetime
import time
import textacy
from scipy import *
import os
# Raise the csv module's per-field size limit to the largest value the platform allows.
csv.field_size_limit(sys.maxsize)
# Directory containing this file, with a trailing slash so names can be appended directly.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
# NOTE: `global` statements at module level have no effect; the names below
# are module globals simply because they are assigned at module level.
global REGEX_SPECIALCHAR
global WORDS
# Character class of the special characters that clean() splits text on.
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'
# Dictionary consulted by the spellchecking helpers (word -> numeric weight,
# see P()); it is empty at import time.
WORDS= {}
########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download
import re
def words(text):
    """Return all runs of word characters found in `text`, lower-cased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
def P(word, N=None):
    """Probability of `word`.

    :param word: token to look up in WORDS
    :param N: total weight to normalise by; when omitted it is computed from
        the contents of WORDS at call time
    :return: WORDS[word] / N, or 0 when the word is unknown or the total is 0
    """
    # The total must not be a default-argument expression: defaults are
    # evaluated once, at definition time, when WORDS is still empty, which
    # froze N at 0 and made every call raise ZeroDivisionError.
    if N is None:
        N = sum(WORDS.values())
    if not N:
        return 0.0
    # Unknown words get probability 0 instead of raising KeyError.
    return WORDS.get(word, 0) / N
def correction(word):
    """Return the candidate spelling of `word` that scores highest under P."""
    options = candidates(word)
    return max(options, key=P)
def candidates(word):
    """Return the possible spelling corrections for `word`.

    The first non-empty result wins: the word itself if known, then known
    words one edit away, then known words two edits away, and finally the
    unchanged word wrapped in a list.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]
def known(words):
    """Return the set of entries of `words` that are keys of WORDS."""
    found = set()
    for candidate in words:
        if candidate in WORDS:
            found.add(candidate)
    return found
def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is a deletion, a transposition of two adjacent characters, a
    replacement by, or an insertion of, one of the letters a-z.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # insertion in front of `tail` (also possible at the very end)
        for letter in alphabet:
            variants.add(head + letter + tail)
        if not tail:
            continue
        # deletion of the first character of `tail`
        variants.add(head + tail[1:])
        # replacement of the first character of `tail`
        for letter in alphabet:
            variants.add(head + letter + tail[1:])
        # transposition of the first two characters of `tail`
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants
def edits2(word):
    """Return a generator over all strings two edits away from `word` (duplicates included)."""
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))
def autocorrectWord(word):
    """Return the spelling correction for `word`, or `word` unchanged on failure.

    Correction is best-effort: any ordinary error raised while computing it
    results in the input being returned as is.

    :param word: token to correct
    :return: corrected token, or the original token if correction failed
    """
    try:
        return correction(word)
    except Exception:
        # Deliberately broad, but unlike a bare `except:` this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return word
############# stringcleaning
def clean(stringstream, autocorrect=False):
    """Lazily normalise every string of `stringstream`.

    :param stringstream: iterable of str
    :param autocorrect: when true, run every whitespace-separated word through autocorrectWord
    :return: generator of cleaned str, one per input string
    """
    # (pattern, ASCII replacement) pairs, applied in this order
    transliterations = (
        (r'[ß]', "ss"),
        (r'[ö]', "oe"),
        (r'[ü]', "ue"),
        (r'[ä]', "ae"),
    )
    for raw in stringstream:
        # repair broken unicode on the lower-cased input
        text = textacy.preprocess.fix_bad_unicode(raw.lower(), normalization=u'NFC')

        # split on special characters and rejoin the pieces with blanks
        # (open question: should , . ? ! be split on as well?)
        pieces = re.split(REGEX_SPECIALCHAR, text)
        text = " ".join(pieces)

        # normalise whitespace, then turn remaining line breaks into blanks
        text = textacy.preprocess.normalize_whitespace(text)
        text = re.sub(r'[\n]', " ", text)

        # transliterate ß and the lower-case umlauts
        for pattern, replacement in transliterations:
            text = re.sub(pattern, replacement, text)

        # open question: autocorrect at all?
        # idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
        if autocorrect:
            corrected = map(autocorrectWord, text.split())
            text = " ".join(corrected)

        yield text
def processDictstream(dictstream, funcdict, parser):
    """
    Filter the tokens of selected fields in a stream of dicts.

    Fields whose key is in `funcdict` are parsed, filtered with the predicates
    registered for that key and rebuilt from the lower-cased surviving tokens
    joined by blanks; all other fields are copied unchanged.

    :param dictstream: dict-gen
    :param funcdict: maps a key to its list of token predicates, e.g.
        clean_in_meta = {
            "Solution": funclist,
            ...
        }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for record in dictstream:
        processed = {}
        for field, content in record.items():
            if field not in funcdict:
                processed[field] = content
                continue
            parsed = parser(content)
            survivors = filterTokens(list(parsed), funcdict[field])
            processed[field] = " ".join(tok.lower_ for tok in survivors)
        yield processed
def filterTokens(tokens, funclist):
    """Apply every predicate of `funclist` in turn and keep the tokens that pass all of them.

    :param tokens: list of tokens
    :param funclist: list of predicates, each called with a single token
    :return: list of surviving tokens (`tokens` itself when `funclist` is empty)
    """
    remaining = tokens
    for predicate in funclist:
        passed = filter(predicate, remaining)
        remaining = [*passed]
    return remaining
def removePOS(pos_list):
    """Build a token predicate that is true for tokens whose `pos_` is not in `pos_list`."""
    def keep(tok):
        return tok.pos_ not in pos_list
    return keep
##################################################################################################
# Locations read from config.ini, each prefixed with this file's directory.
# Pickle file named by the [spellchecking] section.
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
# Corpus directories named by the [de_corpus] and [en_corpus] sections.
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
    """
    Load the "<lang>_raw_ticket" corpus, clean its texts and metadata, and
    save the result as "<lang>_clean_ticket".

    :param corpus_path: str, directory the corpora are loaded from and saved to
    :param filter_tokens: not used by this function
    :param clean_in_meta: dict mapping metadata keys to lists of token predicates
    :param lang: str, language prefix of the corpus names
    :param printrandom: int, how many times printRandomDoc is called on the result
    :return: the cleaned textacy-Corpus
    """
    logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))

    source_name = lang + "_raw_ticket"
    target_name = lang + "_clean_ticket"

    # load the raw corpus and start a new, empty one on the same parser
    raw_corpus, parser = load_corpus(corpus_name=source_name, corpus_path=corpus_path)
    clean_corpus = textacy.Corpus(parser)

    # cleaned texts and filtered metadata are streamed into the new corpus
    cleaned_texts = clean(corpus2Text(raw_corpus))
    cleaned_metas = processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser)
    clean_corpus.add_texts(cleaned_texts, cleaned_metas)

    # drop documents that ended up empty
    clean_corpus.remove(lambda doc: len(doc) == 0)

    for _ in range(printrandom):
        printRandomDoc(clean_corpus)

    # save corpus
    save_corpus(corpus=clean_corpus, corpus_path=corpus_path, corpus_name=target_name)
    return clean_corpus
def main():
    """Load the spellchecking dictionary, clean the German ticket corpus and log the elapsed time."""
    # Without this declaration the assignment below would only create a
    # function-local name and the module-level WORDS read by the
    # spellchecking helpers would stay empty.
    global WORDS

    start = time.time()

    WORDS = load_obj(path2wordsdict)

    clean_in_content = []  # open question: needed at all?

    # token predicates per metadata field
    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],
        "Subject": [removePOS(["SPACE", "PUNCT"])],
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }

    cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de", printrandom=5)

    end = time.time()
    logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))
if __name__ == "__main__":
main()

24
cleaning_bsp.txt Normal file
View File

@ -0,0 +1,24 @@
Index: 0
Text: lieber support, ich habe gerade versucht mich mit meiner unicard im firefox browser fuer das service portal zu authentifizieren. das hat vor einigen wochen noch tadelos geklappt und mittlerweile bekomme ich folgende fehlermeldung ich hoffe sie koennen mir weiterhelfen. vielen dank und viele gruesse sascha feldhorst dipl. inform. sascha feldhorst wiss. ang. technische universitaet dortmund maschinenbau lehrstuhl fuer foerder und lagerwesen logistikcampus joseph von fraunhofer str. 2 4 d 44227 dortmund tel. 49 231 755 40 73 fax 49 231 755 47 68 mailto sascha.feldhorst@tu dortmund.de sascha.feldhorst@tu dortmund.de http www.flw.mb.tu dortmund.de www.flw.mb.tu dortmund.de wichtiger hinweis die information in dieser e mail ist vertraulich. sie ist ausschliesslich fuer den adressaten bestimmt. sollten sie nicht der fuer diese e mail bestimmte adressat sein, unterrichten sie bitte den absender und vernichten sie diese mail. vielen dank. unbeschadet der korrespondenz per e mail, sind unsere erklaerungen ausschliesslich final rechtsverbindlich, wenn sie in herkoemmlicher schriftform mit eigenhaendiger unterschrift oder durch uebermittlung eines solchen schriftstuecks per telefax erfolgen. important note the information included in this e mail is confidential. it is solely intended for the recipient. if you are not the intended recipient of this e mail please contact the sender and delete this message. thank you. without prejudice of e mail correspondence, our statements are only legally binding when they are made in the conventional written form with personal signature or when such documents are sent by fax.
categoryName: betrieb
Index: 0
Text: support browser service portal mittlerweile
categoryName: betrieb
Index: 1
Text: telefon umzug antragsteller astrid gramm astrid.gramm@tu dortmund.de terminvorschlag 14.08.2015 einrichtung dezernat 2 abteilung 2.5 psp element uniaccount mnichofm hofmann, nicole gebaeude dezernat 5 raum id 201 651430 telefondose neztwerkdose dt04.5 04.6 telefonnr. 4821 eintrag telefonbuch e mail astrid.gramm@tu dortmund.de voicemail ansprechpartner astrid gramm tel. ansprechpartner 5444 verantwortlicher nutzer type bemerkung frau hofmann wird am 14.08.2015 in die wd 2 umziehen. es ist der raum 201a im og nicht 201 eine bezeichnung der telefondose ist nicht vorhanden.
categoryName: elektronisches telefonbuch
Index: 1
Text: telefon umzug antragsteller gramm einrichtung dezernat abteilung element gebaeude dezernat raum id eintrag telefonbuch mail ansprechpartner gramm ansprechpartner verantwortlicher nutzer type bemerkung raum bezeichnung
categoryName: elektronisches telefonbuch

View File

@ -41,10 +41,8 @@ filename=topicModelTickets.log
[de_corpus]
#input=M42-Export/Tickets_med.csv
#input=M42-Export/Tickets_small.csv
#input=M42-Export/Tickets_mini.csv
input=M42-Export/de_tickets.csv
input=M42-Export/Tickets_small.csv
#input=M42-Export/de_tickets.csv
path=corpi/

View File

@ -97,6 +97,7 @@ metaliste = [
content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(",")))
path2de_csv = FILEPATH + config.get("de_corpus","input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
@ -114,7 +115,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
path_csv_split = path2_csv.split("/")
filename = path_csv_split[len(path_csv_split) - 1]
printlog("Corporization of {0} at {1}".format(filename,datetime.now()))
logprint("Corporization of {0} at {1}".format(filename, datetime.now()))
raw_corpus = textacy.Corpus(lang)
@ -139,7 +140,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
# save corpus
raw_name = lang + "_raw_ticket"
save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)
printlog("Done")
logprint("Done")
def main():
@ -152,7 +153,7 @@ def main():
end = time.time()
printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))
logprint("Time Elapsed Corporization:{0} min".format((end - start) / 60))
if __name__ == "__main__":

16
init.py
View File

@ -272,47 +272,47 @@ path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file")
def main():
start = time.time()
printlog("Init: {0}".format(datetime.now()))
logprint("Init: {0}".format(datetime.now()))
printlog("create and save lemma_dict")
logprint("create and save lemma_dict")
lemma_dict = create_lemma_dict(path2lemma_file)
save_obj(lemma_dict, path2lemmadict)
printlog("Build and save Wordlist for Spellchecking")
logprint("Build and save Wordlist for Spellchecking")
words = build_words_for_spellchecking(path2words_file)
save_obj(words, path2wordlist)
printlog("Build and save Thesaurus")
logprint("Build and save Thesaurus")
thesaurus = build_thesaurus_dict(path2wordnet)
save_obj(thesaurus, path2thesaurus_dict)
printlog("Build and save stoppwortliste")
logprint("Build and save stoppwortliste")
de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
save_obj(de_stop_words, path2stopwordlist_de)
save_obj(en_stop_words, path2stopwordlist_en)
printlog("Build and save nomenliste")
logprint("Build and save nomenliste")
nouns = list_from_files(nouns1,nouns2)
save_obj(nouns, path2nouns_list)
printlog("Build and save firstnameslist")
logprint("Build and save firstnameslist")
vornamen = list_from_files(firstnames_txt)
save_obj(vornamen, path2firstnameslist)
end = time.time()
printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))
logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))

20
main.py
View File

@ -1,29 +1,35 @@
# -*- coding: utf-8 -*-
import time
import init
import corporization
import preprocessing
import topicModeling
import cleaning
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
start = time.time()
import init
init.main()
printlog("")
logprint("")
corporization.main()
printlog("")
logprint("")
cleaning.main()
logprint("")
preprocessing.main()
printlog("")
logprint("")
topicModeling.main()
printlog("")
topicModeling.main(use_raw=False)
logprint("")
topicModeling.main(use_raw=True)
logprint("")
end = time.time()
printlog("Total Time Elapsed: {0} min".format((end - start) / 60))
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))

View File

@ -12,6 +12,10 @@ import spacy
import textacy
from scipy import *
import os
import glob, os
from textacy.fileio import open_sesame
import json
from spacy.tokens.doc import Doc as SpacyDoc
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@ -40,7 +44,7 @@ logging.basicConfig(filename=filename, level=level)
def printlog(string, level="INFO"):
def logprint(string, level="INFO"):
"""log and prints"""
print(string)
if level == "INFO":
@ -91,6 +95,7 @@ def load_obj(path):
with open(path, 'rb') as f:
return pickle.load(f)
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss",
(re.sub(r'[ö]', "oe",
@ -117,7 +122,19 @@ def list_from_files(*paths):
return list(map(textacy.preprocess.normalize_whitespace, liste))
def deprecated(func):
    """This is a decorator which can be used to mark functions
    as deprecated. It will result in a warning being emitted
    when the function is used.

    :param func: the callable to wrap
    :return: a wrapper that emits a DeprecationWarning and then calls `func`
    """
    @functools.wraps(func)
    def new_func(*args, **kwargs):
        # Force the warning to be shown, but only inside this context: the
        # previous filter configuration is restored on exit instead of being
        # overwritten with 'default' on every call.
        with warnings.catch_warnings():
            warnings.simplefilter('always', DeprecationWarning)
            # stacklevel=2 attributes the warning to the caller of the
            # deprecated function rather than to this wrapper
            warnings.warn("Call to deprecated function {}.".format(func.__name__),
                          category=DeprecationWarning, stacklevel=2)
        return func(*args, **kwargs)
    return new_func
def printRandomDoc(textacyCorpus):
@ -127,17 +144,26 @@ def printRandomDoc(textacyCorpus):
"""
print()
if len(textacyCorpus) == 0:
printlog("NO DOCS IN CORPUS")
logprint("NO DOCS IN CORPUS")
else:
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
#printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
printlog("Index: {0} \n Text: {1} \n Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
logprint("Index: {0} \n Text: {1} \n categoryName: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
textacyCorpus[randIndex].metadata['categoryName']))
print()
def corpus2Text(corpus):
    """Yield the `text` attribute of every document in `corpus`, in iteration order."""
    yield from (document.text for document in corpus)
def corpus2Meta(corpus):
    """Yield the `metadata` attribute of every document in `corpus`, in iteration order."""
    yield from (document.metadata for document in corpus)
def saveplaincorpustext(corpus, path):
    """Hand the texts of all documents in `corpus` to textacy's write_file_lines, targeting `path`."""
    texts = corpus2Text(corpus)
    textacy.fileio.write_file_lines(texts, filepath=path)
@ -163,10 +189,16 @@ def save_corpus(corpus, corpus_path, corpus_name):
parserpath = corpus_path + str(parser.lang) + '_parser'
parser.save_to_directory(parserpath)
##
# save content
contentpath = corpus_path + corpus_name + "_content.bin"
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
#save plain content
plainpath = corpus_path + corpus_name + "_content.json"
textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath)
# save meta
metapath = corpus_path + corpus_name + "_meta.json"
textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
@ -175,6 +207,7 @@ def save_corpus(corpus, corpus_path, corpus_name):
def load_corpus(corpus_path, corpus_name, lang="de"):
"""
Load textacy-Corpus including spacy-parser out from file
@ -207,16 +240,115 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
contentpath = corpus_path + corpus_name + "_content.bin"
plainpath = corpus_path + corpus_name + "_content.json"
metapath = corpus_path + corpus_name + "_meta.json"
metadata_stream = textacy.fileio.read_json_lines(metapath)
try:
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
metadata_stream = textacy.fileio.read_json_lines(metapath)
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
corpus.add_doc(
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
except:
# neu init!!
corpus = textacy.Corpus(parser)
plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}
metadata_stream = textacy.fileio.read_json_lines(metapath)
for plain, metadata in zip(plain_stream, metadata_stream):
corpus.add_doc(
textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
return corpus, corpus.spacy_lang
def save_corpusV2(corpus, corpus_path, corpus_name):
    """
    saves a textacy-corpus including spacy-parser, writing one
    "<index>_doc.bin" and one "<index>_meta.json" file per document into
    "<corpus_path><corpus_name>_docs/"

    :param corpus: textacy-Corpus
    :param corpus_path: str
    :param corpus_name: str (should contain the language like "_de_")
    """
    # save parser
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    contentpath = corpus_path + corpus_name + "_docs/"
    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(contentpath, exist_ok=True)

    for doc in corpus:
        prefix = contentpath + str(doc.corpus_index)
        # the serialized doc is binary data, so the file must be opened in
        # binary mode; text mode rejects bytes on Python 3
        with open(prefix + "_doc.bin", 'wb') as f:
            f.write(doc.spacy_doc.to_bytes())
        with open(prefix + "_meta.json", 'w') as file:
            file.write(json.dumps(doc.metadata))
def load_corpusV2(corpus_path, corpus_name, lang="de"):
    """
    Load textacy-Corpus including spacy-parser out from file

    :param corpus_path: str
    :param corpus_name: str (should contain the language like "_de_")
    :param lang: str (language code) or spacy.Language
    :return: textacy.Corpus, spacy.language
    """
    # check for language: a tag in the corpus name overrides `lang`
    for code in ("de", "en"):
        if code + "_" in corpus_name:
            lang = code
            break

    # load parser and restore its saved string store and lexemes
    parser = spacy.load(lang)
    parser_dir = corpus_path + str(lang) + '_parser'
    with open(parser_dir + '/vocab/strings.json') as file:
        parser.vocab.strings.load(file)
    parser.vocab.load_lexemes(Path(parser_dir + '/vocab/lexemes.bin'))

    # load corpus
    corpus = textacy.Corpus(parser)
    contentpath = corpus_path + corpus_name + "_docs/"
    docs = yield_fromdir(contentpath, spacy_vocab=corpus.spacy_vocab, type="doc")
    metas = yield_fromdir(contentpath, type="meta")

    # documents and metadata are paired up in the order both streams yield them
    for spacy_doc, meta in zip(docs, metas):
        restored = textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=meta)
        corpus.add_doc(restored)

    return corpus, corpus.spacy_lang
def yield_fromdir(path, spacy_vocab=None, type=".pkl"):
    """
    Yield the objects stored in the files of directory `path` whose name contains `type`.

    Files are visited in the order of the name part before the first "_":
    numeric prefixes first, in numeric order, then all other prefixes in
    string order.

    :param path: str, directory to read from
    :param spacy_vocab: vocab handed to SpacyDoc when type == 'doc'
    :param type: str, substring selecting the files; 'doc' yields spacy docs,
        'meta' yields parsed JSON, anything else yields load_obj(file)
    :return: generator of loaded objects
    """
    def index_key(filename):
        # numeric order for index prefixes ("2" before "10"), which plain
        # string sorting does not give
        prefix = filename.split("_")[0]
        if prefix.isdecimal():
            return (0, int(prefix), "")
        return (1, 0, prefix)

    # The directory is listed directly instead of os.chdir()-ing into it, so
    # the working directory of the process is left untouched and a relative
    # `path` keeps working.
    filelist = sorted(
        (name for name in os.listdir(path)
         if type in name and os.path.isfile(os.path.join(path, name))),
        key=index_key,
    )

    for filename in filelist:
        filepath = os.path.join(path, filename)
        if type == 'doc':
            # serialized docs are binary data
            with open(filepath, 'rb') as f:
                for bytes_string in SpacyDoc.read_bytes(f):
                    yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
        elif type == 'meta':
            with open(filepath, 'r') as f:
                yield json.load(f)
        else:
            yield load_obj(filepath)

View File

@ -1,3 +1,5 @@
unicard
uniaccount
kernspaltung
kernfission
atomspaltung

View File

@ -27,12 +27,6 @@ with open(config_ini) as f:
global REGEX_SPECIALCHAR
global REGEX_TOPLVL
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
global THESAURUS
global WORDS
global LEMMAS
@ -41,6 +35,10 @@ global VORNAMEN
global DE_STOP_WORDS
global EN_STOP_WORDS
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
THESAURUS = {}
WORDS= {}
LEMMAS= {}
@ -185,7 +183,7 @@ def autocorrectWord(word):
############# stringcleaning
@deprecated
def stringcleaning(stringstream):
@ -225,7 +223,6 @@ def stringcleaning(stringstream):
def filterTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
@ -257,20 +254,6 @@ def processContentstream2(textstream, parser, token_filterlist=None):
def preparse(stringstream):
for string in stringstream:
# fixUnicode
string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
# seperate_words_on_regex:
string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
#normalize whitespace
string = textacy.preprocess.normalize_whitespace(string)
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# cut_after
# todo addressen enfernen --> postal.parser idee zu metadaten hinzufügen
@ -312,6 +295,7 @@ def corpus2Meta(corpus):
for doc in corpus:
yield doc.metadata
@deprecated
def processContentstream(textstream, parser, token_filterlist=None):
"""
:param textstream: string-gen
@ -398,21 +382,22 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now()))
logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now()))
rawCorpus_name = lang + "_raw_ticket"
cleanCorpus_name = lang + "_clean_ticket"
preCorpus_name = lang + "_pre_ticket"
logprint("Load {0}_raw".format(lang))
#load raw corpus and create new one
raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)
clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path)
corpus = textacy.Corpus(parser)
## process and add files to textacy-corpi,
corpus.add_texts(
processContentstream2(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
processContentstream2(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser)
)
@ -429,6 +414,16 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)
#save corpus as labled, plain text
plainpath = FILEPATH + config.get("de_corpus", "path") + "labled_lines.txt"
textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath )
return corpus
def labledCorpiLines(corpus):
    """Yield one line "[<categoryName>] <text>" for every document in `corpus`."""
    for document in corpus:
        # the bracketed label comes first, then the document text
        label = document.metadata["categoryName"]
        yield "[" + label + "] " + document.text
def main():
@ -468,12 +463,16 @@ def main():
"categoryName": [removePOS(["SPACE", "PUNCT"])]
}
preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" )
corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de",printrandom=5)
#from topicModeling import jgibbsLLDA
#jgibbsLLDA(corpus)
#preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )
end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
if __name__ == "__main__":
main()

View File

@ -15,11 +15,26 @@ start = time.time()
import enchant
from datetime import datetime
import os
import xml.etree.ElementTree as ET
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
parser = spacy.load("de")
"""
# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
PARSER=spacy.load("de")
@ -48,13 +63,74 @@ def makemeta( testmetda):
yield metdata
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
corpi.add_texts(
makecontent(testcontetn),
makemeta(testmetda)
)
corpus_de_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test/"
rawCorpus_name = "de_test_ticket"
print(corpi)
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
dict = {"unicard redaktionsteam": 189, "kms": 131, "itmc_st\u00f6rungen": 17, "benutzerverwaltung_probleme": 168, "mailverteiler exchange": 130, "beamer": 70, "cws_confluence": 190, "benutzerverwaltung": 26, "sos": 166, "virtuelle server": 116, "sap": 7, "wlan": 21, "lsf": 6, "gastaufenthalt": 8, "umzug": 5, "firewall betreuung": 129, "ausleihe": 39, "fiona": 10, "kursplanung": 195, "schulungsraum verwaltung": 200, "plagiatserkennung": 32, "designentwicklung": 100, "ub basis it": 184, "tsm": 51, "backup tsm": 110, "raumkalender": 174, "veeam": 149, "linux bs": 42, "hochleistungsrechnen": 90, "e learning": 37, "h\u00f6rsaal\u00fcbertragung": 52, "sophos": 88, "service portal redaktion": 182, "verkauf": 93, "fk 16": 30, "campus app": 54, "dns": 71, "kurse": 196, "itmc schulungsr\u00e4ume": 96, "leitung": 91, "telefon": 14, "housing": 135, "softwarelizenzen": 35, "hcm stammdaten": 68, "semesterticket": 197, "exchange nutzung": 33, "mediendienste": 167, "sam spider": 172, "pvp": 27, "webserver": 29, "werkvertr\u00e4ge": 158, "ibz raumbuchung": 177, "webmailer": 126, "unicard sperrung": 64, "cd dvd produktion": 114, "lizenzserver": 92, "pr\u00fcfungsmanagement": 38, "blogs wikis foren": 87, "unicard ausgabe": 161, "pools": 157, "desktop & basisdienste": 144, "antrag auf rechnungserstellung": 193, "mailalias": 121, "evaexam": 133, "neuanschluss": 0, "mobilfunkvertr\u00e4ge": 69, "ftp server": 191, "haustechnik": 77, "raumbuchungssysteme": 186, "confluence": 181, "uniaccount zugangsdaten": 47, "itmc medienr\u00e4ume ef50": 171, "dokoll support": 128, "elektronisches telefonbuch": 3, "softwareverteilung": 153, "overhead projektor": 104, "sicherheit": 145, "itmc_als": 48, "itmc pools": 160, "zhb": 60, "serversupport": 101, "veranstaltungen": 61, "fk12 webauftritt": 138, "hardware": 142, "unicard produktion": 156, "telefonkonferenzen": 170, "dhcp": 188, "zertifikate server dfn": 139, "lan": 1, "datanet": 49, "neuausstattung": 173, "moodle": 16, "abmeldung": 13, "uni mail": 
15, "medienr\u00e4ume ef50": 117, "verschiedene aufgaben": 40, "zentrale webserver": 75, "vorlesungsaufzeichnung": 152, "grafik": 132, "campus management": 72, "hacker angriff": 46, "pos": 23, "zugangsdaten": 41, "serviceportal": 63, "ews": 24, "voicemail box": 150, "service desk itmc": 74, "test": 180, "beschaffung": 57, "bestellung": 185, "vpn": 55, "app feedback": 66, "allgemein": 134, "rundmail": 105, "telefonabrechnung": 199, "limesurvey": 31, "unicard": 28, "eldorado": 140, "uniaccount": 12, "plotter": 125, "mdm mobile device management": 120, "namens\u00e4nderung": 43, "sd": 84, "basis applikationen": 103, "\u00e4nderung": 194, "fileserver einrichtung": 187, "fk14_test": 154, "werkst\u00e4tte": 179, "itmc_aufgaben": 45, "formulare antr\u00e4ge": 81, "facility": 192, "web": 169, "asknet": 136, "server storage": 113, "mail groupware": 20, "rektorat -b\u00fcro": 178, "office": 50, "werkstoffe lehrstuhl bauwesen": 59, "telefonzentrale": 115, "verwaltung": 4, "netze": 22, "beantragung": 82, "d.3 dms": 148, "redmine projektverwaltung": 141, "wsus": 106, "lido": 118, "rechnerr\u00e4ume": 143, "matrix42_hilfe": 18, "boss service desk": 44, "konteneinsicht": 62, "spam phishing": 53, "forensic": 164, "fk 12": 11, "benutzungsverwaltung": 198, "redmine": 79, "basis app": 85, "viren": 95, "fk12 migration": 155, "raumbuchung": 109, "virtuelle desktops citrix": 176, "outlook_einrichtung": 123, "kundenserver": 137, "nrw ticket": 80, "weiterentwicklung": 127, "siport zugangskontrolle": 98, "e mail dienste": 99, "vorlagenerstellung": 36, "video": 19, "studierendensekretariat": 111, "it sicherheit sic": 86, "boss": 25, "technik": 58, "dokoll pvp": 112, "betrieb": 2, "v2 campus app feedback": 151, "mailverteiler": 108, "videoschnitt": 119, "fk raumplanung 09": 9, "sap urlaub": 73, "keine r\u00fcckantwort": 124, "prozess- und projektmanagement": 67, "dienstreise": 34, "webgestaltung": 78, "schulung": 175, "software": 89, "medientechnik": 76, "servicedesk": 107, "service portal": 
94, "software entwicklung": 165, "uniflow": 159, "ub_st\u00f6rungen": 162, "fk15": 183, "uhren": 83, "entwicklung": 163, "videokonferenzen": 97, "itmc webauftritt": 102, "joomla itmc website": 147, "changes": 122, "visitenkartenproduktion": 65, "lizenzmanagement": 146, "tonerb\u00f6rse": 201, "arbeitsplatzsupport": 56}
list = [(key,value) for key,value in dict.items()]
list.sort(key=lambda tup : tup[1])
"""
"""
from spacy.tokens.doc import Doc as SpacyDoc
filepath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin"
# load parser
parser = spacy.load("de")
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
stringstorepath = corpus_path + 'de_parser/vocab/strings.json'
with open(stringstorepath) as file:
parser.vocab.strings.load(file)
vocabpath = Path(corpus_path + 'de_parser/vocab/lexemes.bin')
parser.vocab.load_lexemes(vocabpath)
spacy_vocab = parser.vocab
def readCorpus(filepath):
with open_sesame(filepath, mode='rb') as f:
for bytes_string in SpacyDoc.read_bytes(f):
yield SpacyDoc(spacy_vocab).from_bytes(bytes_string).text
textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt")
"""
# load raw corpus and create new one
#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
#printRandomDoc(raw_corpus)
"""
spacy_doc = PARSER("test")
save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
spacy_doc2 = load_obj("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
print("Doc: {0}".format(spacy_doc2))
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/"
@ -63,6 +139,7 @@ laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_h
with open(LLDA_filepath, 'w') as file:
file.write(json.dumps(laveldict))
"""
"""
def load_corpus(corpus_path, corpus_name, lang="de"):
from pathlib import Path

View File

@ -31,13 +31,21 @@ with open(config_ini) as f:
config.read_file(f)
def label2ID(label, labeldict):
    """Return the id stored for `label` in `labeldict`, or len(labeldict) when the label is missing."""
    fallback_id = len(labeldict)
    return labeldict.get(label, fallback_id)
def generate_labled_lines(textacyCorpus, labeldict):
    """Yield one line "[<label id>] <text>" for every document in `textacyCorpus`.

    The label id is looked up with label2ID from the document's "categoryName" metadata.
    """
    for document in textacyCorpus:
        label_id = label2ID(document.metadata["categoryName"], labeldict)
        yield "[" + str(label_id) + "] " + document.text
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
printlog(str("ngrams: {0}".format(ngrams)))
printlog(str("min_df: {0}".format(min_df)))
printlog(str("max_df: {0}".format(max_df)))
printlog(str("named_entities: {0}".format(named_entities)))
logprint(str("ngrams: {0}".format(ngrams)))
logprint(str("min_df: {0}".format(min_df)))
logprint(str("max_df: {0}".format(max_df)))
logprint(str("named_entities: {0}".format(named_entities)))
# printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
@ -48,19 +56,20 @@ def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf'
for t in terms_list:
print(t)
printlog("doc_term_matrix: {0}".format(doc_term_matrix))
printlog("id2term: {0}".format(id2term))
logprint("doc_term_matrix: {0}".format(doc_term_matrix))
logprint("id2term: {0}".format(id2term))
def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False):
printlog(
logprint(
"############################################ Topic Modeling {0} #############################################".format(
topicModel))
print("\n\n")
printlog(str("ngrams: {0}".format(ngrams)))
printlog(str("min_df: {0}".format(min_df)))
printlog(str("max_df: {0}".format(max_df)))
printlog(str("n_topics: {0}".format(n_topics)))
printlog(str("named_entities: {0}".format(named_entities)))
logprint(str("ngrams: {0}".format(ngrams)))
logprint(str("min_df: {0}".format(min_df)))
logprint(str("max_df: {0}".format(max_df)))
logprint(str("n_topics: {0}".format(n_topics)))
logprint(str("named_entities: {0}".format(named_entities)))
start = time.time()
@ -98,13 +107,13 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='
print()
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
printlog(topic_idx)
logprint(topic_idx)
for j in top_docs:
printlog(corpus[j].metadata['categoryName'])
logprint(corpus[j].metadata['categoryName'])
print()
#####################################################################################################################
@ -112,40 +121,34 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='
print()
end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
def jgibbsLLDA(de_corpus, top_topic_words):
def jgibbsLLDA(corpus, path2save_results, top_topic_words=15, add_default_topic=False):
##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
start = time.time()
def label2ID(label, labeldict):
    """Return the numeric ID stored for `label` in `labeldict`.

    Labels that are not present in `labeldict` are mapped to
    ``len(labeldict)``.
    """
    if label in labeldict:
        return labeldict[label]
    # unknown label: fall back to len(labeldict)
    return len(labeldict)
def generate_labled_lines(textacyCorpus, labeldict):
    """Yield one labeled line per document of `textacyCorpus`.

    Each line has the layout ``[<label id>] <document text>``, where the
    label id is looked up from the document's ``categoryName`` metadata
    via `label2ID`.
    """
    for doc in textacyCorpus:
        # resolve the ticket category of this document to its numeric ID
        label_id = label2ID(doc.metadata["categoryName"], labeldict)
        # assemble "[id] text" from the pieces
        yield "".join(("[", str(label_id), "] ", doc.text))
# build dictionary of ticket categories
labelist = []
for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
labeldict = {k: v for v, k in enumerate(labelist)}
if add_default_topic:
n_topics = len(labeldict) + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
else:
n_topics = len(labeldict) # + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
# printlog(str("LABELDICT: {0}".format(labeldict)))
printlog(str("LABELDICT-length: {0}".format(len(labeldict))))
#logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
with open(dict_path, 'w') as file:
file.write(json.dumps(labeldict))
@ -153,59 +156,107 @@ def jgibbsLLDA(de_corpus, top_topic_words):
# print(line)
# create file
textacy.fileio.write_file_lines(generate_labled_lines(de_corpus,labeldict), filepath=LLDA_filepath)
textacy.fileio.write_file_lines(generate_labled_lines(corpus, labeldict), filepath=LLDA_filepath)
# wait for file to exist
while not os.path.exists(LLDA_filepath):
time.sleep(1)
"""
printlog("")
printlog("start LLDA:")
logprint("")
logprint("start LLDA:")
# run JGibsslda file
FNULL = open(os.devnull, 'w') # supress output
subprocess.call(["java",
"-cp",
cmd_jgibbs_java = ["java", "-cp",
"{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
jgibbsLLDA_root),
"jgibblda.LDA",
"-est",
"-dir", "{0}models/tickets".format(jgibbsLLDA_root),
"-dfile", "tickets.gz",
"-twords", str(top_topic_words),
"-ntopics", str(n_topics)], stdout=FNULL)
"jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
"tickets.gz",
"-twords", str(top_topic_words), "-ntopics", str(n_topics)]
subprocess.call(cmd_jgibbs_java, stdout=FNULL)
# ANMERKUNG: Dateien sind versteckt. zu finden in models/
# twords
"""
subprocess.call(["gzip",
"-dc",
"{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
#####################################################################################################################
printlog("")
"""
cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
"""
proc = subprocess.Popen(cmd_gzip, stdout=subprocess.PIPE)
process = subprocess.Popen(cmd_gzip, shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# wait for the process to terminate
out, err = process.communicate()
errcode = process.returncode
result = subprocess.check_output(cmd_gzip)
#result = proc.stdout.read()
result = proc.communicate()
out=[]
for line in result:
out.append(line)
"""
output = subprocess.check_output(cmd_gzip).decode("utf-8")
reverse_labeldict = {v: k for k, v in labeldict.items()}
result = []
regex = re.compile(r'Topic [0-9]')
for line in output.splitlines():
findall = regex.findall(line)
if len(findall) != 0:
try:
index = int(findall[0].split()[1])
result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
except:
result.append(line)
else:
result.append(line)
textacy.fileio.write_file_lines(result, path2save_results)
#####################################################################################################################
logprint("")
end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
def main():
printlog("Topic Modeling: {0}".format(datetime.now()))
def main(use_raw=False):
logprint("Topic Modeling: {0}".format(datetime.now()))
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
if use_raw:
preCorpus_name = "de" + "_raw_ticket"
else:
preCorpus_name = "de" + "_pre_ticket"
# load raw corpus and create new one
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
printlog("Corpus loaded: {0}".format(de_corpus.lang))
logprint("Corpus loaded: {0}".format(de_corpus.lang))
# idea: http://bigartm.org/
# idea: http://wiki.languagetool.org/tips-and-tricks
# idea: https://en.wikipedia.org/wiki/Noisy_text_analytics
# idea: https://gate.ac.uk/family/
# todo: find a sensible tf(-idf) weighting measure
# todo: for each model, save the labeled corpus, the results and the labeldict
# todo: consolidate (summarize) topics
# question: how many tickets per topic?
ngrams = 1
min_df = 1
max_df = 1.0
@ -213,7 +264,6 @@ def main():
# weighting ='tfidf'
named_entities = False
"""
printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
@ -224,8 +274,34 @@ def main():
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
"""
if use_raw:
resultspath = FILEPATH + "results/raw"
else:
resultspath = FILEPATH + "results/pre"
jgibbsLLDA(de_corpus,15)
top_topic_words = 5
add_default_topic = False
path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
top_topic_words = 5
add_default_topic = True
path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
top_topic_words = 10
add_default_topic = False
path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
top_topic_words = 10
add_default_topic = True
path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
# no_below = 20
# no_above = 0.5
@ -280,7 +356,6 @@ def main():
"""
if __name__ == "__main__":
main()