topicmodeling jgibbsllda lauffähig
This commit is contained in:
parent
16d3e1cb70
commit
3137dc6e54
|
@ -0,0 +1,242 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datetime import datetime
|
||||
import csv
|
||||
import sys
|
||||
from miscellaneous import *
|
||||
from datetime import datetime
|
||||
import time
|
||||
import textacy
|
||||
from scipy import *
|
||||
|
||||
import os
|
||||
|
||||
csv.field_size_limit(sys.maxsize)
|
||||
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
|
||||
|
||||
|
||||
|
||||
# Load the project configuration that sits next to this module.
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

# NOTE: the former module-level `global REGEX_SPECIALCHAR` / `global WORDS`
# statements were removed. At module scope a `global` statement has no
# effect; a function that wants to rebind one of these names has to declare
# it `global` inside its own body.

# Character class on which clean() splits strings apart.
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'

# Word -> frequency table used by the spellchecking helpers below.
# Starts out empty.
WORDS = {}
|
||||
|
||||
|
||||
########################## Spellchecking ##########################################
|
||||
# http://norvig.com/spell-correct.html
|
||||
# http://wortschatz.uni-leipzig.de/en/download
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def words(text):
    """Return every run of word characters in `text`, lower-cased, in order."""
    lowered = text.lower()
    return re.compile(r'\w+').findall(lowered)
|
||||
|
||||
def P(word, N=None):
    """Return the relative frequency of `word` in WORDS.

    :param word: str, the word to look up
    :param N: total count to divide by; when omitted it is computed from the
        current contents of WORDS at call time. (The previous default was
        evaluated once at definition time, while WORDS was still empty, so
        it was always 0 and every call raised ZeroDivisionError.)
    :return: WORDS[word] / N, or 0 for an unknown word or an empty table
    """
    if N is None:
        N = sum(WORDS.values())
    if not N:
        # Empty frequency table: no word has any probability mass.
        return 0.0
    # .get() keeps unknown candidates rankable instead of raising KeyError.
    return WORDS.get(word, 0) / N
|
||||
|
||||
|
||||
def correction(word):
    """Pick, among the candidates for `word`, the one that P() rates highest."""
    options = candidates(word)
    best = max(options, key=P)
    return best
|
||||
|
||||
|
||||
def candidates(word):
    """Return possible spelling corrections for `word`.

    The first non-empty result wins, tried in this order: the word itself
    if known, known words one edit away, known words two edits away, and
    finally the unchanged word as a one-element list.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]
|
||||
|
||||
|
||||
def known(words):
    """Return the set of entries from `words` that are keys of WORDS."""
    return {entry for entry in words if entry in WORDS}
|
||||
|
||||
|
||||
def edits1(word):
    """Return the set of strings that are one simple edit away from `word`.

    An edit is a deletion, a transposition of two adjacent characters, a
    replacement, or an insertion. Replacements and insertions only use the
    letters a-z.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()

    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # insertion of one letter at the cut position
        variants.update(head + ch + tail for ch in alphabet)

        if not tail:
            continue

        # deletion of the character right after the cut
        variants.add(head + tail[1:])
        # replacement of that character
        variants.update(head + ch + tail[1:] for ch in alphabet)

        if len(tail) > 1:
            # transposition of the two characters after the cut
            variants.add(head + tail[1] + tail[0] + tail[2:])

    return variants
|
||||
|
||||
|
||||
def edits2(word):
    """Return a generator over all strings two edits away from `word`.

    The result is lazy and may contain duplicates.
    """
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))
|
||||
|
||||
|
||||
def autocorrectWord(word):
    """Return the spelling correction for `word`, or `word` itself on failure.

    Best effort: any ordinary error raised while correcting leaves the word
    unchanged. Only `Exception` is caught, so KeyboardInterrupt and
    SystemExit still propagate and a long-running cleaning job can be
    aborted. (The previous bare `except:` swallowed those as well.)

    :param word: str
    :return: str
    """
    try:
        return correction(word)
    except Exception:
        return word
|
||||
|
||||
|
||||
############# stringcleaning
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def clean(stringstream,autocorrect=False):
    """Lazily clean every string of `stringstream`.

    Each string is lower-cased and passed through textacy's
    fix_bad_unicode, split on REGEX_SPECIALCHAR and re-joined with single
    blanks, whitespace-normalized, stripped of line breaks, and has
    ß/ö/ü/ä transliterated to ss/oe/ue/ae.

    :param stringstream: iterable of str
    :param autocorrect: if True, every whitespace-separated word is
        additionally passed through autocorrectWord()
    :return: generator of cleaned str
    """
    # Loop-invariant work is done once instead of once per string.
    split_on_specialchars = re.compile(REGEX_SPECIALCHAR).split
    rock_dots = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})

    for string in stringstream:
        # fix unicode
        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')

        # separate words on regex (open question from the author: also split on ,.?! ?)
        string = " ".join(split_on_specialchars(string))

        # normalize whitespace
        string = textacy.preprocess.normalize_whitespace(string)

        # remove linebreaks
        string = string.replace("\n", " ")

        # replace "rock dots" (umlauts) and ß in a single pass
        string = string.translate(rock_dots)

        # open question from the author: autocorrect?
        # idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
        if autocorrect:
            string = " ".join(autocorrectWord(word) for word in string.split())

        yield string
|
||||
|
||||
|
||||
|
||||
def processDictstream(dictstream, funcdict, parser):
    """Lazily filter selected values of every dict in `dictstream`.

    :param dictstream: dict-gen
    :param funcdict: maps a key to the list of token predicates to apply
        to that key's value, e.g.
            clean_in_meta = {
                "Solution": funclist,
                ...
            }
    :param parser: spacy-parser
    :return: dict-gen; values whose key is in `funcdict` are parsed,
        filtered with filterTokens() and re-joined from the lower-cased
        surviving tokens, all other values are passed through unchanged
    """
    for record in dictstream:
        cleaned = {}

        for field, content in record.items():
            if field not in funcdict:
                cleaned[field] = content
                continue

            kept = filterTokens(list(parser(content)), funcdict[field])
            cleaned[field] = " ".join(tok.lower_ for tok in kept)

        yield cleaned
|
||||
|
||||
def filterTokens(tokens, funclist):
    """Apply every predicate of `funclist` to `tokens`, one after another.

    :param tokens: token list
    :param funclist: list of predicates; a token survives only if every
        predicate accepts it
    :return: token list (the input object itself when `funclist` is empty)
    """
    remaining = tokens
    for predicate in funclist:
        remaining = [*filter(predicate, remaining)]
    return remaining
|
||||
|
||||
def removePOS(pos_list):
    """Build a token predicate that rejects tokens whose `pos_` is in `pos_list`."""

    def keep(tok):
        return tok.pos_ not in pos_list

    return keep
|
||||
|
||||
##################################################################################################
|
||||
|
||||
|
||||
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
|
||||
|
||||
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
||||
|
||||
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
|
||||
|
||||
|
||||
|
||||
|
||||
def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
    """Build, save and return the cleaned corpus for `lang`.

    Loads "<lang>_raw_ticket" from `corpus_path`, runs its texts through
    clean() and its metadata through processDictstream(), removes empty
    documents, prints `printrandom` random documents and saves the result
    as "<lang>_clean_ticket".

    :param corpus_path: str, directory of the stored corpora
    :param filter_tokens: not used by this function
    :param clean_in_meta: dict handed to processDictstream() as funcdict
    :param lang: str, language prefix of the corpus names
    :param printrandom: int, number of printRandomDoc() calls
    :return: the cleaned textacy corpus
    """
    logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))

    source_name = lang + "_raw_ticket"
    target_name = lang + "_clean_ticket"

    # load the raw corpus and start a fresh one on the same parser
    raw_corpus, parser = load_corpus(corpus_name=source_name, corpus_path=corpus_path)
    clean_corpus = textacy.Corpus(parser)

    # process texts and metadata lazily and add them to the new corpus
    cleaned_texts = clean(corpus2Text(raw_corpus))
    cleaned_metas = processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser)
    clean_corpus.add_texts(cleaned_texts, cleaned_metas)

    # drop empty docs from the corpus
    clean_corpus.remove(lambda doc: len(doc) == 0)

    for _ in range(printrandom):
        printRandomDoc(clean_corpus)

    # save corpus
    save_corpus(corpus=clean_corpus, corpus_path=corpus_path, corpus_name=target_name)

    return clean_corpus
|
||||
|
||||
|
||||
|
||||
def main():
    """Load the spellchecking word table, clean the German corpus and log the elapsed time."""
    # Without this declaration the assignment below would only create a
    # local variable, and the spellchecking helpers would keep reading the
    # empty module-level WORDS dict.
    global WORDS

    start = time.time()

    WORDS = load_obj(path2wordsdict)

    # open question from the author: is this needed? (cleanCorpus does not read it)
    clean_in_content = []

    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],
        "Subject": [removePOS(["SPACE", "PUNCT"])],
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }

    cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de", printrandom=5)

    end = time.time()
    logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
Index: 0
|
||||
Text: lieber support, ich habe gerade versucht mich mit meiner unicard im firefox browser fuer das service portal zu authentifizieren. das hat vor einigen wochen noch tadelos geklappt und mittlerweile bekomme ich folgende fehlermeldung ich hoffe sie koennen mir weiterhelfen. vielen dank und viele gruesse sascha feldhorst dipl. inform. sascha feldhorst wiss. ang. technische universitaet dortmund maschinenbau lehrstuhl fuer foerder und lagerwesen logistikcampus joseph von fraunhofer str. 2 4 d 44227 dortmund tel. 49 231 755 40 73 fax 49 231 755 47 68 mailto sascha.feldhorst@tu dortmund.de sascha.feldhorst@tu dortmund.de http www.flw.mb.tu dortmund.de www.flw.mb.tu dortmund.de wichtiger hinweis die information in dieser e mail ist vertraulich. sie ist ausschliesslich fuer den adressaten bestimmt. sollten sie nicht der fuer diese e mail bestimmte adressat sein, unterrichten sie bitte den absender und vernichten sie diese mail. vielen dank. unbeschadet der korrespondenz per e mail, sind unsere erklaerungen ausschliesslich final rechtsverbindlich, wenn sie in herkoemmlicher schriftform mit eigenhaendiger unterschrift oder durch uebermittlung eines solchen schriftstuecks per telefax erfolgen. important note the information included in this e mail is confidential. it is solely intended for the recipient. if you are not the intended recipient of this e mail please contact the sender and delete this message. thank you. without prejudice of e mail correspondence, our statements are only legally binding when they are made in the conventional written form with personal signature or when such documents are sent by fax.
|
||||
categoryName: betrieb
|
||||
|
||||
Index: 0
|
||||
Text: support browser service portal mittlerweile
|
||||
categoryName: betrieb
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Index: 1
|
||||
Text: telefon umzug antragsteller astrid gramm astrid.gramm@tu dortmund.de terminvorschlag 14.08.2015 einrichtung dezernat 2 abteilung 2.5 psp element uniaccount mnichofm hofmann, nicole gebaeude dezernat 5 raum id 201 651430 telefondose neztwerkdose dt04.5 04.6 telefonnr. 4821 eintrag telefonbuch e mail astrid.gramm@tu dortmund.de voicemail ansprechpartner astrid gramm tel. ansprechpartner 5444 verantwortlicher nutzer type bemerkung frau hofmann wird am 14.08.2015 in die wd 2 umziehen. es ist der raum 201a im og nicht 201 eine bezeichnung der telefondose ist nicht vorhanden.
|
||||
categoryName: elektronisches telefonbuch
|
||||
|
||||
Index: 1
|
||||
Text: telefon umzug antragsteller gramm einrichtung dezernat abteilung element gebaeude dezernat raum id eintrag telefonbuch mail ansprechpartner gramm ansprechpartner verantwortlicher nutzer type bemerkung raum bezeichnung
|
||||
categoryName: elektronisches telefonbuch
|
|
@ -41,10 +41,8 @@ filename=topicModelTickets.log
|
|||
|
||||
|
||||
[de_corpus]
|
||||
#input=M42-Export/Tickets_med.csv
|
||||
#input=M42-Export/Tickets_small.csv
|
||||
#input=M42-Export/Tickets_mini.csv
|
||||
input=M42-Export/de_tickets.csv
|
||||
input=M42-Export/Tickets_small.csv
|
||||
#input=M42-Export/de_tickets.csv
|
||||
|
||||
path=corpi/
|
||||
|
||||
|
|
|
@ -97,6 +97,7 @@ metaliste = [
|
|||
content_collumn_name = config.get("tickets","content_collumn_name")
|
||||
metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(",")))
|
||||
|
||||
|
||||
path2de_csv = FILEPATH + config.get("de_corpus","input")
|
||||
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
||||
|
||||
|
@ -114,7 +115,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
|
|||
path_csv_split = path2_csv.split("/")
|
||||
filename = path_csv_split[len(path_csv_split) - 1]
|
||||
|
||||
printlog("Corporization of {0} at {1}".format(filename,datetime.now()))
|
||||
logprint("Corporization of {0} at {1}".format(filename, datetime.now()))
|
||||
|
||||
|
||||
raw_corpus = textacy.Corpus(lang)
|
||||
|
@ -139,7 +140,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
|
|||
# save corpus
|
||||
raw_name = lang + "_raw_ticket"
|
||||
save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)
|
||||
printlog("Done")
|
||||
logprint("Done")
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -152,7 +153,7 @@ def main():
|
|||
|
||||
|
||||
end = time.time()
|
||||
printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))
|
||||
logprint("Time Elapsed Corporization:{0} min".format((end - start) / 60))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
16
init.py
16
init.py
|
@ -272,47 +272,47 @@ path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file")
|
|||
|
||||
def main():
|
||||
start = time.time()
|
||||
printlog("Init: {0}".format(datetime.now()))
|
||||
logprint("Init: {0}".format(datetime.now()))
|
||||
|
||||
|
||||
|
||||
printlog("create and save lemma_dict")
|
||||
logprint("create and save lemma_dict")
|
||||
lemma_dict = create_lemma_dict(path2lemma_file)
|
||||
save_obj(lemma_dict, path2lemmadict)
|
||||
|
||||
|
||||
printlog("Build and save Wordlist for Spellchecking")
|
||||
logprint("Build and save Wordlist for Spellchecking")
|
||||
words = build_words_for_spellchecking(path2words_file)
|
||||
save_obj(words, path2wordlist)
|
||||
|
||||
|
||||
|
||||
printlog("Build and save Thesaurus")
|
||||
logprint("Build and save Thesaurus")
|
||||
thesaurus = build_thesaurus_dict(path2wordnet)
|
||||
save_obj(thesaurus, path2thesaurus_dict)
|
||||
|
||||
|
||||
|
||||
|
||||
printlog("Build and save stoppwortliste")
|
||||
logprint("Build and save stoppwortliste")
|
||||
de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
|
||||
save_obj(de_stop_words, path2stopwordlist_de)
|
||||
save_obj(en_stop_words, path2stopwordlist_en)
|
||||
|
||||
|
||||
|
||||
printlog("Build and save nomenliste")
|
||||
logprint("Build and save nomenliste")
|
||||
nouns = list_from_files(nouns1,nouns2)
|
||||
save_obj(nouns, path2nouns_list)
|
||||
|
||||
|
||||
printlog("Build and save firstnameslist")
|
||||
logprint("Build and save firstnameslist")
|
||||
vornamen = list_from_files(firstnames_txt)
|
||||
save_obj(vornamen, path2firstnameslist)
|
||||
|
||||
|
||||
end = time.time()
|
||||
printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))
|
||||
logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))
|
||||
|
||||
|
||||
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
20
main.py
20
main.py
|
@ -1,29 +1,35 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import time
|
||||
import init
|
||||
import corporization
|
||||
import preprocessing
|
||||
import topicModeling
|
||||
import cleaning
|
||||
from miscellaneous import *
|
||||
|
||||
|
||||
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
|
||||
start = time.time()
|
||||
|
||||
import init
|
||||
init.main()
|
||||
printlog("")
|
||||
logprint("")
|
||||
|
||||
corporization.main()
|
||||
printlog("")
|
||||
logprint("")
|
||||
|
||||
cleaning.main()
|
||||
logprint("")
|
||||
|
||||
preprocessing.main()
|
||||
printlog("")
|
||||
logprint("")
|
||||
|
||||
topicModeling.main()
|
||||
printlog("")
|
||||
topicModeling.main(use_raw=False)
|
||||
logprint("")
|
||||
topicModeling.main(use_raw=True)
|
||||
logprint("")
|
||||
|
||||
end = time.time()
|
||||
printlog("Total Time Elapsed: {0} min".format((end - start) / 60))
|
||||
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))
|
||||
|
||||
|
||||
|
|
150
miscellaneous.py
150
miscellaneous.py
|
@ -12,6 +12,10 @@ import spacy
|
|||
import textacy
|
||||
from scipy import *
|
||||
import os
|
||||
import glob, os
|
||||
from textacy.fileio import open_sesame
|
||||
import json
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
|
||||
csv.field_size_limit(sys.maxsize)
|
||||
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
|
||||
|
@ -40,7 +44,7 @@ logging.basicConfig(filename=filename, level=level)
|
|||
|
||||
|
||||
|
||||
def printlog(string, level="INFO"):
|
||||
def logprint(string, level="INFO"):
|
||||
"""log and prints"""
|
||||
print(string)
|
||||
if level == "INFO":
|
||||
|
@ -91,6 +95,7 @@ def load_obj(path):
|
|||
with open(path, 'rb') as f:
|
||||
return pickle.load(f)
|
||||
|
||||
|
||||
def replaceRockDots():
|
||||
return lambda string: re.sub(r'[ß]', "ss",
|
||||
(re.sub(r'[ö]', "oe",
|
||||
|
@ -117,7 +122,19 @@ def list_from_files(*paths):
|
|||
return list(map(textacy.preprocess.normalize_whitespace, liste))
|
||||
|
||||
|
||||
def deprecated(func):
    """Decorator that marks a function as deprecated.

    Every call to the decorated function emits a DeprecationWarning naming
    the function, then runs it normally.

    The warning is forced to show inside a ``warnings.catch_warnings()``
    context, so the process-wide filter list is restored afterwards. The
    previous version called ``simplefilter('always')`` and then
    ``simplefilter('default')`` directly, which left extra
    DeprecationWarning filters in front of whatever the application had
    configured.

    :param func: the callable to wrap
    :return: wrapper with the same name/docstring (functools.wraps)
    """

    @functools.wraps(func)
    def new_func(*args, **kwargs):
        with warnings.catch_warnings():
            # make sure this warning is not suppressed by an existing filter
            warnings.simplefilter('always', DeprecationWarning)
            # stacklevel=2 attributes the warning to the caller of new_func
            warnings.warn("Call to deprecated function {}.".format(func.__name__),
                          category=DeprecationWarning, stacklevel=2)
        return func(*args, **kwargs)

    return new_func
|
||||
|
||||
|
||||
def printRandomDoc(textacyCorpus):
|
||||
|
@ -127,17 +144,26 @@ def printRandomDoc(textacyCorpus):
|
|||
"""
|
||||
print()
|
||||
if len(textacyCorpus) == 0:
|
||||
printlog("NO DOCS IN CORPUS")
|
||||
logprint("NO DOCS IN CORPUS")
|
||||
else:
|
||||
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
|
||||
#printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
|
||||
randIndex = int((len(textacyCorpus) - 1) * random.random())
|
||||
printlog("Index: {0} \n Text: {1} \n Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
|
||||
textacyCorpus[randIndex].metadata['categoryName']))
|
||||
logprint("Index: {0} \n Text: {1} \n categoryName: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
|
||||
textacyCorpus[randIndex].metadata['categoryName']))
|
||||
|
||||
print()
|
||||
|
||||
|
||||
def corpus2Text(corpus):
    """Yield the `text` attribute of every document in `corpus`, in order."""
    yield from (document.text for document in corpus)
|
||||
|
||||
def corpus2Meta(corpus):
    """Yield the `metadata` attribute of every document in `corpus`, in order."""
    yield from (document.metadata for document in corpus)
|
||||
|
||||
def saveplaincorpustext(corpus,path):
    """Write the text of every document in `corpus` to the file at `path`.

    The texts come from corpus2Text() and are written with textacy's
    write_file_lines.
    """
    text_lines = corpus2Text(corpus)
    textacy.fileio.write_file_lines(text_lines, filepath=path)
|
||||
|
||||
|
||||
|
||||
|
@ -163,10 +189,16 @@ def save_corpus(corpus, corpus_path, corpus_name):
|
|||
parserpath = corpus_path + str(parser.lang) + '_parser'
|
||||
parser.save_to_directory(parserpath)
|
||||
|
||||
##
|
||||
|
||||
# save content
|
||||
contentpath = corpus_path + corpus_name + "_content.bin"
|
||||
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
|
||||
|
||||
#save plain content
|
||||
plainpath = corpus_path + corpus_name + "_content.json"
|
||||
textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath)
|
||||
|
||||
# save meta
|
||||
metapath = corpus_path + corpus_name + "_meta.json"
|
||||
textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
|
||||
|
@ -175,6 +207,7 @@ def save_corpus(corpus, corpus_path, corpus_name):
|
|||
|
||||
|
||||
|
||||
|
||||
def load_corpus(corpus_path, corpus_name, lang="de"):
|
||||
"""
|
||||
Load textacy-Corpus including spacy-parser out from file
|
||||
|
@ -207,16 +240,115 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
|
|||
|
||||
|
||||
contentpath = corpus_path + corpus_name + "_content.bin"
|
||||
plainpath = corpus_path + corpus_name + "_content.json"
|
||||
metapath = corpus_path + corpus_name + "_meta.json"
|
||||
|
||||
try:
|
||||
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
|
||||
metadata_stream = textacy.fileio.read_json_lines(metapath)
|
||||
|
||||
metadata_stream = textacy.fileio.read_json_lines(metapath)
|
||||
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
|
||||
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
|
||||
corpus.add_doc(
|
||||
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
|
||||
corpus.add_doc(
|
||||
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
|
||||
except:
|
||||
# neu init!!
|
||||
corpus = textacy.Corpus(parser)
|
||||
|
||||
plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}
|
||||
metadata_stream = textacy.fileio.read_json_lines(metapath)
|
||||
|
||||
for plain, metadata in zip(plain_stream, metadata_stream):
|
||||
corpus.add_doc(
|
||||
textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
|
||||
|
||||
|
||||
return corpus, corpus.spacy_lang
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def save_corpusV2(corpus, corpus_path, corpus_name):
    """
    Save a textacy-corpus including its spacy-parser, one file pair per doc.

    The parser goes to "<corpus_path><parser.lang>_parser". Every document
    is written to "<corpus_path><corpus_name>_docs/" as
    "<corpus_index>_doc.bin" (serialized spacy doc) and
    "<corpus_index>_meta.json" (metadata).

    :param corpus: textacy-Corpus
    :param corpus_path: str
    :param corpus_name: str (should contain the language like "_de_")
    """

    # save parser
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    contentpath = corpus_path + corpus_name + "_docs/"
    os.makedirs(contentpath, exist_ok=True)

    for doc in corpus:
        prefix = contentpath + str(doc.corpus_index)

        # to_bytes() yields binary data, so the file must be opened in
        # binary mode; text mode 'w' raises TypeError on Python 3.
        with open(prefix + "_doc.bin", 'wb') as f:
            f.write(doc.spacy_doc.to_bytes())

        with open(prefix + "_meta.json", 'w') as file:
            file.write(json.dumps(doc.metadata))
|
||||
|
||||
def load_corpusV2(corpus_path, corpus_name, lang="de"):
    """
    Load a textacy-Corpus including its spacy-parser from per-document files.

    :param corpus_path: str
    :param corpus_name: str (should contain the language like "_de_")
    :param lang: str (language code); overridden when `corpus_name`
        contains "de_" or "en_"
    :return: textacy.Corpus, spacy.language
    """

    # check the corpus name for a language marker ("de_" takes precedence)
    for code in ("de", "en"):
        if code + "_" in corpus_name:
            lang = code
            break

    # load parser and restore its stored vocabulary
    parser = spacy.load(lang)
    parser_dir = corpus_path + str(lang) + '_parser'

    with open(parser_dir + '/vocab/strings.json') as strings_file:
        parser.vocab.strings.load(strings_file)

    parser.vocab.load_lexemes(Path(parser_dir + '/vocab/lexemes.bin'))

    # load corpus
    corpus = textacy.Corpus(parser)

    docs_dir = corpus_path + corpus_name + "_docs/"
    spacy_docs = yield_fromdir(docs_dir, spacy_vocab=corpus.spacy_vocab, type="doc")
    metadata_stream = yield_fromdir(docs_dir, type="meta")

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))

    return corpus, corpus.spacy_lang
|
||||
|
||||
def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
    """Yield the objects stored in the files of directory `path`.

    Only regular files whose name contains `type` are read. Files are
    ordered by the part of their name before the first "_"; numeric
    prefixes are compared as integers (2 before 10), all other names come
    afterwards in string order.

    Changes from the previous version:
    - the working directory is no longer changed (``os.chdir(path)`` leaked
      into the rest of the process and broke relative `path` values)
    - serialized docs are opened in binary mode
    - numeric prefixes are no longer sorted lexicographically

    :param path: str, directory to read from
    :param spacy_vocab: vocab handed to SpacyDoc; only used for type="doc"
    :param type: "doc" yields spacy docs, "meta" yields parsed JSON,
        anything else yields load_obj() of every matching file
    :return: generator
    """
    def index_key(filename):
        prefix = filename.split("_")[0]
        try:
            return (0, int(prefix), "")
        except ValueError:
            return (1, 0, prefix)

    filelist = sorted(
        (name for name in os.listdir(path)
         if type in name and os.path.isfile(os.path.join(path, name))),
        key=index_key,
    )

    for filename in filelist:
        filepath = os.path.join(path, filename)

        if type == 'doc':
            with open(filepath, 'rb') as f:
                for bytes_string in SpacyDoc.read_bytes(f):
                    yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
        elif type == 'meta':
            with open(filepath, 'r') as f:
                yield json.load(f)
        else:
            yield load_obj(filepath)
|
||||
|
||||
|
||||
|
|
|
@ -27,12 +27,6 @@ with open(config_ini) as f:
|
|||
global REGEX_SPECIALCHAR
|
||||
global REGEX_TOPLVL
|
||||
|
||||
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]'
|
||||
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
|
||||
|
||||
|
||||
|
||||
|
||||
global THESAURUS
|
||||
global WORDS
|
||||
global LEMMAS
|
||||
|
@ -41,6 +35,10 @@ global VORNAMEN
|
|||
global DE_STOP_WORDS
|
||||
global EN_STOP_WORDS
|
||||
|
||||
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'
|
||||
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
|
||||
|
||||
|
||||
THESAURUS = {}
|
||||
WORDS= {}
|
||||
LEMMAS= {}
|
||||
|
@ -185,7 +183,7 @@ def autocorrectWord(word):
|
|||
|
||||
|
||||
############# stringcleaning
|
||||
|
||||
@deprecated
|
||||
def stringcleaning(stringstream):
|
||||
|
||||
|
||||
|
@ -225,7 +223,6 @@ def stringcleaning(stringstream):
|
|||
|
||||
|
||||
|
||||
|
||||
def filterTokens(tokens, funclist):
|
||||
# in:tokenlist, funclist
|
||||
# out: tokenlist
|
||||
|
@ -257,20 +254,6 @@ def processContentstream2(textstream, parser, token_filterlist=None):
|
|||
def preparse(stringstream):
|
||||
|
||||
for string in stringstream:
|
||||
# fixUnicode
|
||||
string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
|
||||
|
||||
# seperate_words_on_regex:
|
||||
string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
|
||||
|
||||
#normalize whitespace
|
||||
string = textacy.preprocess.normalize_whitespace(string)
|
||||
|
||||
# replaceRockDots
|
||||
string = re.sub(r'[ß]', "ss", string)
|
||||
string = re.sub(r'[ö]', "oe", string)
|
||||
string = re.sub(r'[ü]', "ue", string)
|
||||
string = re.sub(r'[ä]', "ae", string)
|
||||
|
||||
# cut_after
|
||||
# todo addressen enfernen --> postal.parser idee zu metadaten hinzufügen
|
||||
|
@ -312,6 +295,7 @@ def corpus2Meta(corpus):
|
|||
for doc in corpus:
|
||||
yield doc.metadata
|
||||
|
||||
@deprecated
|
||||
def processContentstream(textstream, parser, token_filterlist=None):
|
||||
"""
|
||||
:param textstream: string-gen
|
||||
|
@ -398,21 +382,22 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
|
|||
|
||||
def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
|
||||
|
||||
printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now()))
|
||||
logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now()))
|
||||
|
||||
rawCorpus_name = lang + "_raw_ticket"
|
||||
cleanCorpus_name = lang + "_clean_ticket"
|
||||
preCorpus_name = lang + "_pre_ticket"
|
||||
|
||||
logprint("Load {0}_raw".format(lang))
|
||||
#load raw corpus and create new one
|
||||
raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)
|
||||
clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path)
|
||||
|
||||
corpus = textacy.Corpus(parser)
|
||||
|
||||
|
||||
## process and add files to textacy-corpi,
|
||||
corpus.add_texts(
|
||||
processContentstream2(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
|
||||
processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
|
||||
processContentstream2(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
|
||||
processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser)
|
||||
)
|
||||
|
||||
|
||||
|
@ -429,6 +414,16 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
|
|||
save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)
|
||||
|
||||
|
||||
#save corpus as labled, plain text
|
||||
plainpath = FILEPATH + config.get("de_corpus", "path") + "labled_lines.txt"
|
||||
textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath )
|
||||
|
||||
return corpus
|
||||
|
||||
def labledCorpiLines(corpus):
    """Yield one labelled line per document: "[<categoryName>] <text>"."""
    # generates "[topic] tok1 tok2 tok3" out of the corpus
    for document in corpus:
        label = document.metadata["categoryName"]
        yield "".join(("[", label, "] ", document.text))
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -468,12 +463,16 @@ def main():
|
|||
"categoryName": [removePOS(["SPACE", "PUNCT"])]
|
||||
}
|
||||
|
||||
preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" )
|
||||
corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de",printrandom=5)
|
||||
|
||||
#from topicModeling import jgibbsLLDA
|
||||
|
||||
#jgibbsLLDA(corpus)
|
||||
|
||||
#preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )
|
||||
|
||||
end = time.time()
|
||||
printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
|
||||
logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
81
testra.py
81
testra.py
|
@ -15,11 +15,26 @@ start = time.time()
|
|||
import enchant
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
|
||||
from miscellaneous import *
|
||||
|
||||
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
|
||||
|
||||
parser = spacy.load("de")
|
||||
|
||||
|
||||
|
||||
"""
|
||||
# load config
|
||||
config_ini = FILEPATH + "config.ini"
|
||||
|
||||
config = ConfigParser.ConfigParser()
|
||||
with open(config_ini) as f:
|
||||
config.read_file(f)
|
||||
|
||||
|
||||
PARSER=spacy.load("de")
|
||||
|
||||
|
||||
|
@ -48,13 +63,74 @@ def makemeta( testmetda):
|
|||
yield metdata
|
||||
|
||||
|
||||
def corpus2Text(corpus):
|
||||
for doc in corpus:
|
||||
yield doc.text
|
||||
|
||||
corpi.add_texts(
|
||||
makecontent(testcontetn),
|
||||
makemeta(testmetda)
|
||||
)
|
||||
|
||||
corpus_de_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test/"
|
||||
rawCorpus_name = "de_test_ticket"
|
||||
print(corpi)
|
||||
|
||||
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
|
||||
|
||||
#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
|
||||
|
||||
|
||||
dict = {"unicard redaktionsteam": 189, "kms": 131, "itmc_st\u00f6rungen": 17, "benutzerverwaltung_probleme": 168, "mailverteiler exchange": 130, "beamer": 70, "cws_confluence": 190, "benutzerverwaltung": 26, "sos": 166, "virtuelle server": 116, "sap": 7, "wlan": 21, "lsf": 6, "gastaufenthalt": 8, "umzug": 5, "firewall betreuung": 129, "ausleihe": 39, "fiona": 10, "kursplanung": 195, "schulungsraum verwaltung": 200, "plagiatserkennung": 32, "designentwicklung": 100, "ub basis it": 184, "tsm": 51, "backup tsm": 110, "raumkalender": 174, "veeam": 149, "linux bs": 42, "hochleistungsrechnen": 90, "e learning": 37, "h\u00f6rsaal\u00fcbertragung": 52, "sophos": 88, "service portal redaktion": 182, "verkauf": 93, "fk 16": 30, "campus app": 54, "dns": 71, "kurse": 196, "itmc schulungsr\u00e4ume": 96, "leitung": 91, "telefon": 14, "housing": 135, "softwarelizenzen": 35, "hcm stammdaten": 68, "semesterticket": 197, "exchange nutzung": 33, "mediendienste": 167, "sam spider": 172, "pvp": 27, "webserver": 29, "werkvertr\u00e4ge": 158, "ibz raumbuchung": 177, "webmailer": 126, "unicard sperrung": 64, "cd dvd produktion": 114, "lizenzserver": 92, "pr\u00fcfungsmanagement": 38, "blogs wikis foren": 87, "unicard ausgabe": 161, "pools": 157, "desktop & basisdienste": 144, "antrag auf rechnungserstellung": 193, "mailalias": 121, "evaexam": 133, "neuanschluss": 0, "mobilfunkvertr\u00e4ge": 69, "ftp server": 191, "haustechnik": 77, "raumbuchungssysteme": 186, "confluence": 181, "uniaccount zugangsdaten": 47, "itmc medienr\u00e4ume ef50": 171, "dokoll support": 128, "elektronisches telefonbuch": 3, "softwareverteilung": 153, "overhead projektor": 104, "sicherheit": 145, "itmc_als": 48, "itmc pools": 160, "zhb": 60, "serversupport": 101, "veranstaltungen": 61, "fk12 webauftritt": 138, "hardware": 142, "unicard produktion": 156, "telefonkonferenzen": 170, "dhcp": 188, "zertifikate server dfn": 139, "lan": 1, "datanet": 49, "neuausstattung": 173, "moodle": 16, "abmeldung": 13, "uni mail": 
15, "medienr\u00e4ume ef50": 117, "verschiedene aufgaben": 40, "zentrale webserver": 75, "vorlesungsaufzeichnung": 152, "grafik": 132, "campus management": 72, "hacker angriff": 46, "pos": 23, "zugangsdaten": 41, "serviceportal": 63, "ews": 24, "voicemail box": 150, "service desk itmc": 74, "test": 180, "beschaffung": 57, "bestellung": 185, "vpn": 55, "app feedback": 66, "allgemein": 134, "rundmail": 105, "telefonabrechnung": 199, "limesurvey": 31, "unicard": 28, "eldorado": 140, "uniaccount": 12, "plotter": 125, "mdm mobile device management": 120, "namens\u00e4nderung": 43, "sd": 84, "basis applikationen": 103, "\u00e4nderung": 194, "fileserver einrichtung": 187, "fk14_test": 154, "werkst\u00e4tte": 179, "itmc_aufgaben": 45, "formulare antr\u00e4ge": 81, "facility": 192, "web": 169, "asknet": 136, "server storage": 113, "mail groupware": 20, "rektorat -b\u00fcro": 178, "office": 50, "werkstoffe lehrstuhl bauwesen": 59, "telefonzentrale": 115, "verwaltung": 4, "netze": 22, "beantragung": 82, "d.3 dms": 148, "redmine projektverwaltung": 141, "wsus": 106, "lido": 118, "rechnerr\u00e4ume": 143, "matrix42_hilfe": 18, "boss service desk": 44, "konteneinsicht": 62, "spam phishing": 53, "forensic": 164, "fk 12": 11, "benutzungsverwaltung": 198, "redmine": 79, "basis app": 85, "viren": 95, "fk12 migration": 155, "raumbuchung": 109, "virtuelle desktops citrix": 176, "outlook_einrichtung": 123, "kundenserver": 137, "nrw ticket": 80, "weiterentwicklung": 127, "siport zugangskontrolle": 98, "e mail dienste": 99, "vorlagenerstellung": 36, "video": 19, "studierendensekretariat": 111, "it sicherheit sic": 86, "boss": 25, "technik": 58, "dokoll pvp": 112, "betrieb": 2, "v2 campus app feedback": 151, "mailverteiler": 108, "videoschnitt": 119, "fk raumplanung 09": 9, "sap urlaub": 73, "keine r\u00fcckantwort": 124, "prozess- und projektmanagement": 67, "dienstreise": 34, "webgestaltung": 78, "schulung": 175, "software": 89, "medientechnik": 76, "servicedesk": 107, "service portal": 
94, "software entwicklung": 165, "uniflow": 159, "ub_st\u00f6rungen": 162, "fk15": 183, "uhren": 83, "entwicklung": 163, "videokonferenzen": 97, "itmc webauftritt": 102, "joomla itmc website": 147, "changes": 122, "visitenkartenproduktion": 65, "lizenzmanagement": 146, "tonerb\u00f6rse": 201, "arbeitsplatzsupport": 56}
|
||||
|
||||
list = [(key,value) for key,value in dict.items()]
|
||||
|
||||
list.sort(key=lambda tup : tup[1])
|
||||
"""
|
||||
"""
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
|
||||
filepath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin"
|
||||
|
||||
# load parser
|
||||
parser = spacy.load("de")
|
||||
|
||||
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
|
||||
|
||||
stringstorepath = corpus_path + 'de_parser/vocab/strings.json'
|
||||
with open(stringstorepath) as file:
|
||||
parser.vocab.strings.load(file)
|
||||
|
||||
vocabpath = Path(corpus_path + 'de_parser/vocab/lexemes.bin')
|
||||
parser.vocab.load_lexemes(vocabpath)
|
||||
|
||||
spacy_vocab = parser.vocab
|
||||
|
||||
def readCorpus(filepath):
|
||||
with open_sesame(filepath, mode='rb') as f:
|
||||
for bytes_string in SpacyDoc.read_bytes(f):
|
||||
yield SpacyDoc(spacy_vocab).from_bytes(bytes_string).text
|
||||
|
||||
|
||||
textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt")
|
||||
"""
|
||||
|
||||
|
||||
|
||||
# load raw corpus and create new one
|
||||
#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
|
||||
|
||||
#printRandomDoc(raw_corpus)
|
||||
|
||||
|
||||
"""
|
||||
spacy_doc = PARSER("test")
|
||||
save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
|
||||
|
||||
spacy_doc2 = load_obj("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
|
||||
|
||||
print("Doc: {0}".format(spacy_doc2))
|
||||
|
||||
|
||||
|
||||
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/"
|
||||
|
||||
|
@ -63,6 +139,7 @@ laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_h
|
|||
with open(LLDA_filepath, 'w') as file:
|
||||
file.write(json.dumps(laveldict))
|
||||
"""
|
||||
"""
|
||||
def load_corpus(corpus_path, corpus_name, lang="de"):
|
||||
from pathlib import Path
|
||||
|
||||
|
|
203
topicModeling.py
203
topicModeling.py
|
@ -31,13 +31,21 @@ with open(config_ini) as f:
|
|||
config.read_file(f)
|
||||
|
||||
|
||||
def label2ID(label, labeldict):
    """Return the numeric ID stored for `label` in `labeldict`.

    A label that is not a key of `labeldict` is mapped to ``len(labeldict)``,
    i.e. the first integer after the IDs 0..len-1 handed out by enumeration.
    """
    fallback_id = len(labeldict)
    return labeldict.get(label, fallback_id)
|
||||
|
||||
|
||||
def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
|
||||
printlog(str("ngrams: {0}".format(ngrams)))
|
||||
printlog(str("min_df: {0}".format(min_df)))
|
||||
printlog(str("max_df: {0}".format(max_df)))
|
||||
printlog(str("named_entities: {0}".format(named_entities)))
|
||||
def generate_labled_lines(textacyCorpus, labeldict):
    """Yield one labeled input line per document of `textacyCorpus`.

    Every line has the form ``[<label id>] <document text>``; the label id is
    looked up from the document's ``categoryName`` metadata via `label2ID`.
    """
    for ticket in textacyCorpus:
        topic_id = label2ID(ticket.metadata["categoryName"], labeldict)
        yield "".join(("[", str(topic_id), "] ", ticket.text))
|
||||
|
||||
|
||||
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
|
||||
logprint(str("ngrams: {0}".format(ngrams)))
|
||||
logprint(str("min_df: {0}".format(min_df)))
|
||||
logprint(str("max_df: {0}".format(max_df)))
|
||||
logprint(str("named_entities: {0}".format(named_entities)))
|
||||
|
||||
# printlog("vectorize corpi...")
|
||||
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
|
||||
|
@ -48,19 +56,20 @@ def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf'
|
|||
|
||||
for t in terms_list:
|
||||
print(t)
|
||||
printlog("doc_term_matrix: {0}".format(doc_term_matrix))
|
||||
printlog("id2term: {0}".format(id2term))
|
||||
logprint("doc_term_matrix: {0}".format(doc_term_matrix))
|
||||
logprint("id2term: {0}".format(id2term))
|
||||
|
||||
def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda',named_entities=False):
|
||||
printlog(
|
||||
|
||||
def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False):
|
||||
logprint(
|
||||
"############################################ Topic Modeling {0} #############################################".format(
|
||||
topicModel))
|
||||
print("\n\n")
|
||||
printlog(str("ngrams: {0}".format(ngrams)))
|
||||
printlog(str("min_df: {0}".format(min_df)))
|
||||
printlog(str("max_df: {0}".format(max_df)))
|
||||
printlog(str("n_topics: {0}".format(n_topics)))
|
||||
printlog(str("named_entities: {0}".format(named_entities)))
|
||||
logprint(str("ngrams: {0}".format(ngrams)))
|
||||
logprint(str("min_df: {0}".format(min_df)))
|
||||
logprint(str("max_df: {0}".format(max_df)))
|
||||
logprint(str("n_topics: {0}".format(n_topics)))
|
||||
logprint(str("named_entities: {0}".format(named_entities)))
|
||||
|
||||
start = time.time()
|
||||
|
||||
|
@ -98,13 +107,13 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='
|
|||
print()
|
||||
|
||||
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
|
||||
printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
|
||||
logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
|
||||
|
||||
print()
|
||||
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
|
||||
printlog(topic_idx)
|
||||
logprint(topic_idx)
|
||||
for j in top_docs:
|
||||
printlog(corpus[j].metadata['categoryName'])
|
||||
logprint(corpus[j].metadata['categoryName'])
|
||||
print()
|
||||
|
||||
#####################################################################################################################
|
||||
|
@ -112,100 +121,142 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='
|
|||
print()
|
||||
|
||||
end = time.time()
|
||||
printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
|
||||
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
|
||||
|
||||
|
||||
def jgibbsLLDA(de_corpus, top_topic_words):
|
||||
def jgibbsLLDA(corpus, path2save_results, top_topic_words=15, add_default_topic=False):
    """Run labeled LDA on `corpus` through the external JGibbLabeledLDA Java tool.

    Every distinct ``categoryName`` found in the document metadata becomes one
    label. The label -> ID mapping (JSON) and the labeled corpus are written
    below ``<FILEPATH>/java_LabledLDA/models/tickets/``, the Java estimator is
    started on them, and its ``.twords.gz`` output is decompressed and written
    to `path2save_results` with each ``Topic <n>`` header extended by the
    label that was assigned ID ``n``.

    Args:
        corpus: iterable of documents exposing ``metadata["categoryName"]``
            and ``text``.
        path2save_results: path of the text file that receives the annotated
            top-words listing.
        top_topic_words: value passed to the Java tool as ``-twords``.
        add_default_topic: if true, ``-ntopics`` is the number of labels plus
            one; otherwise exactly the number of labels.

    Raises:
        subprocess.CalledProcessError: if ``gzip -dc`` on the result file fails
            (for example because the Java run produced no output).
    """
    start = time.time()

    # Build the dictionary of ticket categories: each category gets the next
    # free integer ID in order of first appearance.
    labeldict = {}
    for texdoc in corpus:
        labeldict.setdefault(texdoc.metadata["categoryName"], len(labeldict))

    # Optionally reserve one extra topic as a default topic.
    n_topics = len(labeldict) + (1 if add_default_topic else 0)

    # os.path.join avoids the doubled slashes of plain string concatenation;
    # the trailing "" keeps the separator the classpath template relies on.
    jgibbsLLDA_root = os.path.join(FILEPATH, "java_LabledLDA", "")
    models_dir = os.path.join(jgibbsLLDA_root, "models", "tickets")
    LLDA_filepath = os.path.join(models_dir, "tickets.gz")
    dict_path = os.path.join(models_dir, "labeldict.txt")

    with open(dict_path, 'w') as dict_file:
        dict_file.write(json.dumps(labeldict))

    # Create the labeled input file for the Java tool.
    textacy.fileio.write_file_lines(generate_labled_lines(corpus, labeldict), filepath=LLDA_filepath)

    # Wait for the file to exist.
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")

    # Run JGibbLabeledLDA; its console output is discarded.
    classpath = "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
        jgibbsLLDA_root)
    cmd_jgibbs_java = ["java", "-cp", classpath,
                       "jgibblda.LDA", "-est",
                       "-dir", models_dir,
                       "-dfile", "tickets.gz",
                       "-twords", str(top_topic_words),
                       "-ntopics", str(n_topics)]
    returncode = subprocess.call(cmd_jgibbs_java, stdout=subprocess.DEVNULL)
    if returncode != 0:
        # Do not abort here; just make the failure visible in the log.
        logprint("JGibbLabeledLDA exited with return code {0}".format(returncode))

    # NOTE: the result files are hidden (dot-prefixed); they live in models/tickets/.
    cmd_gzip = ["gzip", "-dc", os.path.join(models_dir, ".twords.gz")]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    reverse_labeldict = {v: k for k, v in labeldict.items()}
    result = _label_topic_headers(output.splitlines(), reverse_labeldict)

    textacy.fileio.write_file_lines(result, path2save_results)
    #####################################################################################################################
    logprint("")

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))


def _label_topic_headers(lines, id2label):
    """Return `lines` with every ``Topic <n>`` line replaced by ``Topic <n> <label>:``.

    Lines without a topic header, and headers whose number has no entry in
    `id2label` (e.g. an additional default topic), are passed through unchanged.
    """
    # Capture the complete number: a single-digit pattern would read
    # "Topic 12" as topic 1 and attach the wrong label.
    topic_header = re.compile(r'Topic ([0-9]+)')
    labeled = []
    for line in lines:
        match = topic_header.search(line)
        index = int(match.group(1)) if match is not None else None
        if index is not None and index in id2label:
            labeled.append("Topic {} {}:".format(index, id2label[index]))
        else:
            labeled.append(line)
    return labeled
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
printlog("Topic Modeling: {0}".format(datetime.now()))
|
||||
def main(use_raw=False):
|
||||
logprint("Topic Modeling: {0}".format(datetime.now()))
|
||||
|
||||
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
||||
|
||||
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
|
||||
if use_raw:
|
||||
preCorpus_name = "de" + "_raw_ticket"
|
||||
else:
|
||||
preCorpus_name = "de" + "_pre_ticket"
|
||||
|
||||
preCorpus_name = "de" + "_pre_ticket"
|
||||
|
||||
#load raw corpus and create new one
|
||||
# load raw corpus and create new one
|
||||
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
|
||||
printlog("Corpus loaded: {0}".format(de_corpus.lang))
|
||||
logprint("Corpus loaded: {0}".format(de_corpus.lang))
|
||||
|
||||
#idee http://bigartm.org/
|
||||
#idee http://wiki.languagetool.org/tips-and-tricks
|
||||
# idee http://bigartm.org/
|
||||
# idee http://wiki.languagetool.org/tips-and-tricks
|
||||
# idee https://en.wikipedia.org/wiki/Noisy_text_analytics
|
||||
# idee https://gate.ac.uk/family/
|
||||
|
||||
# todo gescheites tf(-idf) maß finden
|
||||
# todo pro model: gelabelten corpus, ergebnisse und labeldict speichern
|
||||
# todo topics zusammenfassen
|
||||
# frage wieviele tickets pro topic?
|
||||
|
||||
|
||||
ngrams = 1
|
||||
min_df = 1
|
||||
max_df = 1.0
|
||||
|
@ -213,7 +264,6 @@ def main():
|
|||
# weighting ='tfidf'
|
||||
named_entities = False
|
||||
|
||||
|
||||
"""
|
||||
printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
|
||||
printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
|
||||
|
@ -224,8 +274,34 @@ def main():
|
|||
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
|
||||
"""
|
||||
|
||||
if use_raw:
|
||||
resultspath = FILEPATH + "results/raw"
|
||||
else:
|
||||
resultspath = FILEPATH + "results/pre"
|
||||
|
||||
jgibbsLLDA(de_corpus,15)
|
||||
top_topic_words = 5
|
||||
add_default_topic = False
|
||||
path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
|
||||
add_default_topic=add_default_topic)
|
||||
|
||||
top_topic_words = 5
|
||||
add_default_topic = True
|
||||
path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
|
||||
add_default_topic=add_default_topic)
|
||||
|
||||
top_topic_words = 10
|
||||
add_default_topic = False
|
||||
path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
|
||||
add_default_topic=add_default_topic)
|
||||
|
||||
top_topic_words = 10
|
||||
add_default_topic = True
|
||||
path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
|
||||
add_default_topic=add_default_topic)
|
||||
|
||||
# no_below = 20
|
||||
# no_above = 0.5
|
||||
|
@ -280,7 +356,6 @@ def main():
|
|||
"""
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
|
Loading…
Reference in New Issue