Last commit. It had become too convoluted.
The new version is IMTC_TopicModeling.

parent 412f25d8d8
commit 1ae9d00c16

cleaning.py (91 lines changed)
@@ -49,15 +49,20 @@ def clean(stringstream):#, NOUNS):
    #string = textacy.preprocess.unidecode(string)

    # seperate_words_on_regex:
-   string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]').split(string))
+   string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]').split(string)) # todo: maybe we keep this? also: for headers and footers maybe English-language detection, address parsing and salutation detection

    #normalize whitespace
    string = textacy.preprocess.normalize_whitespace(string)

-   #remove linebreaks
-   string = re.sub(r'[\n]', " ", string)
-
-   string = replaceRockDots(string)
+   # todo: textacy.preprocess.remove_accents(text, method=u'unicode')

+   #remove linebreaks
+   string = re.sub(r'[\n]', " ", string) # todo: can/should this go? paragraphs might carry meaning

+   string = replaceRockDots(string) # todo: belongs with normalize

    """
    # Correcting capitalization errors via the noun list does not really work, since words inside the sentence get changed as well.
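For orientation, a minimal, self-contained sketch of the cleaning step this hunk converges on: split on the separator characters, normalize whitespace, and replace umlauts/ß. The regex is copied from the diff above and replaceRockDots is reconstructed from the old normalize() shown further down in this commit; the lowercasing and the example call are assumptions, not the repo's exact behaviour.

import re

SEPARATORS = re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]')
WHITESPACE = re.compile(r'\s+')

def replaceRockDots(text):
    # umlaut/ß replacement, as in the old miscellaneous.normalize() further down in this diff
    text = text.lower()
    for src, dst in (("ß", "ss"), ("ö", "oe"), ("ü", "ue"), ("ä", "ae")):
        text = text.replace(src, dst)
    return text

def clean(stringstream):
    """Yield cleaned strings: separator split, whitespace normalization, rock-dot replacement."""
    for string in stringstream:
        string = " ".join(SEPARATORS.split(string))
        string = WHITESPACE.sub(" ", string).strip()  # stand-in for textacy.preprocess.normalize_whitespace
        yield replaceRockDots(string)

print(list(clean(["Sehr geehrte Damen   und Herren, {Störung} im Gebäude"])))
# -> ['sehr geehrte damen und herren, stoerung im gebaeude']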
@@ -91,13 +96,70 @@ def clean(stringstream):#, NOUNS):
    yield string

+def processDictstream_v2(dictstream, keys_to_clean):
+    for dic in dictstream:
+        result = {k: re.sub(r'[.!?]', "", normalize_str(v).lower()) if k in keys_to_clean else v for k, v in dic.items()}
+        yield result

+def processDictstream(dictstream, funcdict, parser):
+    """
+    :param dictstream: dict-gen
+    :param funcdict:
+        clean_in_meta = {
+            "Solution":funclist,
+            ...
+        }
+    :param parser: spacy-parser
+    :return: dict-gen
+    """
+    for dic in dictstream:
+        result = {}
+        for key, value in dic.items():
+            if key in funcdict:
+                doc = parser(value)
+                tokens = [tok for tok in doc]
+                funclist = funcdict[key]
+                tokens = filterTokens(tokens, funclist)
+                result[key] = " ".join([tok.lower_ for tok in tokens])
+            else:
+                result[key] = value
+        yield result

+def filterTokens(tokens, funclist):
+    # in: tokenlist, funclist
+    # out: tokenlist
+    for f in funclist:
+        tokens = list(filter(f, tokens))
+    for tok in tokens:
+        if tok.pos_ == "NOUN":
+            x = 0
+    return tokens

##################################################################################################

corpus_de_path = FILEPATH + config.get("de_corpus", "path")

-def cleanCorpus(corpus):
+def cleanCorpus(corpus,clean_in_meta):
    logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    """
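As a quick illustration of what processDictstream_v2 does to a metadata stream, here is a runnable sketch with made-up ticket metadata; the normalize_str stand-in mimics the rock-dot plus whitespace helper from miscellaneous.py and is an assumption, not the repo's implementation.

import re

def normalize_str(s):
    # stand-in for miscellaneous.normalize_str: umlaut/ß replacement plus whitespace normalization (assumption)
    s = s.lower().replace("ß", "ss").replace("ö", "oe").replace("ü", "ue").replace("ä", "ae")
    return re.sub(r"\s+", " ", s).strip()

def processDictstream_v2(dictstream, keys_to_clean):
    for dic in dictstream:
        yield {k: re.sub(r"[.!?]", "", normalize_str(v).lower()) if k in keys_to_clean else v
               for k, v in dic.items()}

meta = [{"TicketNumber": "INC40484", "Subject": "Störung im Gebäude!", "categoryName": "Neuanschluss"}]
print(list(processDictstream_v2(meta, ["Subject", "categoryName"])))
# -> [{'TicketNumber': 'INC40484', 'Subject': 'stoerung im gebaeude', 'categoryName': 'neuanschluss'}]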
@@ -122,9 +184,12 @@ def cleanCorpus(corpus):

    # Actually clean the corpus
    cleaned_corpus = textacy.Corpus(parser)

    cleaned_corpus.add_texts(
        clean(corpus2Text(raw_corpus)),
-       corpus2Meta(raw_corpus)
+       #processDictstream(corpus2Meta(cleaned_corpus), clean_in_meta, parser=parser)
+       processDictstream_v2(corpus2Meta(raw_corpus),clean_in_meta)
    )

@@ -143,14 +208,26 @@ def cleanCorpus(corpus):
    return cleaned_corpus

+def removePOS(pos_list):
+    return lambda tok: tok.pos_ not in pos_list

def main(corpus):
    start = time.time()

+   clean_in_meta = {
+       "Solution": [removePOS(["SPACE"])],
+       "Subject": [removePOS(["SPACE", "PUNCT"])],
+       "categoryName": [removePOS(["SPACE", "PUNCT"])]
+   }

-   cleaned_corpus = cleanCorpus(corpus)
+   clean_in_meta = ["Subject", "categoryName" ]

+   cleaned_corpus = cleanCorpus(corpus, clean_in_meta)
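The removePOS helper above is a small filter factory: it returns a predicate over spaCy tokens that filterTokens can apply in sequence. A minimal sketch of that pattern without spaCy (the Tok namedtuple is just a stand-in for a spaCy token):

from collections import namedtuple

Tok = namedtuple("Tok", ["text", "pos_"])  # stand-in for a spaCy token

def removePOS(pos_list):
    # returns a predicate: keep a token only if its POS tag is not in pos_list
    return lambda tok: tok.pos_ not in pos_list

def filterTokens(tokens, funclist):
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens

toks = [Tok("Hallo", "INTJ"), Tok(" ", "SPACE"), Tok("!", "PUNCT"), Tok("Drucker", "NOUN")]
print([t.text for t in filterTokens(toks, [removePOS(["SPACE", "PUNCT"])])])
# -> ['Hallo', 'Drucker']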
@@ -67,7 +67,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID
autocorrect = false
#true

-custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember
+custom_words=problem,without,aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember
@@ -0,0 +1,24 @@
+[neuanschluss] telefon dezernat uniaccount pavillon telefonbuch mail amt telefon benoetigt telefon speicherpl.f.die ausreichend moebel aufgestellt.weder
+[neuanschluss] telefon dekanat bereich studium lehre uniaccount ct g2 telefonbuch mail amt anschluss faxgeraet rufnummer
+[lan] service netzwerkanschluss kollegen intranet mail kollegen leitung vpn verbindung intranet netzwerk wlan netz netzwerk mitteilen wenden
+[betrieb] support unicard browser service wochen
+[elektronisches telefonbuch] telefon umzug astrid.gramm@tu-dortmund.de dezernat uniaccount dezernat telefonbuch mail
+[verwaltung] laptop klaerung dezernat organisationsentwicklung mail
+[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss fakultaet berufung telefonanlage gesondert b.
+[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss fax anschluss
+[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss hochschullehrer fakultaet telefonanlage anschlusses
+[lsf] service semester semester stelle studium wahlmodulen experimentelle webseite erstellen vision stufen kommilitonen informatik fakultaet erfinden sichtweise daten boss schaffen aspekt studienplan uebersicht aktuelle ansicht semester modul stufe veranstaltungsinformationen studiengang fakultaet dozent semester turnus orientierung anhand semester automatisiert erstellen datenbank dozent turnus uhrzeit beschreibung boss programmierschnittstelle datenabfrage login benutzername passwort einzelne benutzer erlaubnis liste boss nummer format xml pdf gespraech klaeren
+[sap] mitarbeiter schulung berechtigung budget kennung ort schulung
+[gastaufenthalt] damen pruefung email adresse honorarprofessor ing vorlesungen bereich satellitenkommunikation fakultaet elektrotechnik informationstechnik mitarbeiter lehrstuhl hochfrequenztechnik lehrstuhl email adresse sinnvolle kommunikation hilfsmittel ews sinne email adresse
+[sap] schulung dezernat zuhoeren berechtigung budget lage account
+[fk raumplanung 09] pc modus
+[sap] kolleginnen kollegen schulung anfaenger verwendung feld dezentral zugreifen uebersicht alternative budget berechtigung transaktionen fb60 dezernat sekretariaten kuerze fk05 statistik einsatz korrektur kurze rueckmeldung freischaltung einrichtungen
+[fiona] mitarbeiter fachgebiet regionalsoziologie fakultaet raumplanung fachgebiet alte homepage homepage erscheinungsbild aktuell kenne programm umstellung einstiegsschulung vornehmen besprechen taeglich buero erreichen bescheid weber gb iii raumplanung waehlen mithilfe
+[fk 12] hi zugang fk12-adresse aendern
+[uniaccount] meldung zugangsdaten passwort rechtzeitig zugang problemlos account
+[elektronisches telefonbuch] telefon umzug lehrstuhl uniaccount physik telefonbuch mail nr mitnehmen
+[abmeldung] telefon abmeldung uniaccount telefonbuch mail besitzer nr
+[telefon] telefon geraet display defekt telefon wenden -5886
+[neuanschluss] telefon leitung uniaccount telefonbuch mail amt telefon auszubildende sekretariat azubi sekretariat
+[uni mail] kenntnisnahme loesung alte passwort aendern erklaert passwort buero server absturz problemlos passwort unabhaengig telefonats service geloest erstmal rueckmeldung vorgehensweise kollegen geloest service antrag dienstreise passwort alte passwort mail dienstreisen antrag passwort system unding offenbar it sachverhalt systemausfall wochen reibungslos
+[uni mail] webmailer text einfuegen
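These lines follow the "[label] tok1 tok2 ..." convention that the LLDA code later reads back in (load_corp in test.py). A rough reader sketch, assuming one bracketed label per line; load_labeled_lines is an illustrative name, not the repo's load_corp, which may also support multiple labels per line.

import re

def load_labeled_lines(path):
    """Parse '[label] tok1 tok2 ...' lines into (labelset, docs, labels)."""
    docs, labels = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            m = re.match(r"\[(.*?)\]\s*(.*)", line.strip())
            if not m:
                continue
            labels.append(m.group(1))      # whole bracket content treated as one label
            docs.append(m.group(2).split())
    labelset = sorted(set(labels))
    return labelset, docs, labels

# labelset, corpus, labels = load_labeled_lines("labeled.txt")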
@@ -60,7 +60,7 @@ def ticket_csv_to_DictStream(path2csv,content_collumn_name):
    for i, lst in enumerate(stream):
        if i == 0:
            for j, col in enumerate(lst):
-               if "icketNumb" in col:
+               if "icketNumb" in col: # fixes the .csv header; todo: if this has to be hard-coded here anyway, drop config.ini as well?
                    col = "TicketNumber"
                metalist.append(str(col))
                metaindices.append(j)

@@ -89,7 +89,7 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")

-def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
+def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0): # todo: this could be the main here
    """
    Use textacy to create a Corpus out of the ITMC-Ticket.csv

@@ -105,18 +105,20 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printra
    path_csv_split = path2_csv.split("/")
    filename = path_csv_split[len(path_csv_split) - 1]

-   logprint("Corporization of {0} at {1}".format(filename, datetime.now()))
+   logprint("Corporization of {0}".format(filename))#, datetime.now()))

    raw_corpus = textacy.Corpus(lang)

-   ## add files to textacy-corpi,
+   ## add files to textacy-corpi, todo: do the cleaning here, don't forget the dict
    raw_corpus.add_texts(
        ticketcsv_to_textStream(path2_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2_csv,content_collumn_name)
    )

    # kick empty docs out of the corpus
    raw_corpus.remove(lambda doc: len(doc) == 0)
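ticketcsv_to_textStream and ticket_csv_to_DictStream are only referenced here, so as a sketch of the pattern (one generator for the ticket bodies, one for the metadata dicts, both reading the same CSV) something like the following could work. The column name, delimiter and helper names are assumptions, not the repo's actual code.

import csv

def ticket_rows(path2csv, delimiter=";"):
    with open(path2csv, newline="", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=delimiter)
        header = next(reader)
        for row in reader:
            yield dict(zip(header, row))

def text_stream(path2csv, content_column="Description"):
    for row in ticket_rows(path2csv):
        yield row.get(content_column, "")

def dict_stream(path2csv, content_column="Description"):
    for row in ticket_rows(path2csv):
        yield {k: v for k, v in row.items() if k != content_column}

# texts, metas = text_stream("ITMC-Ticket.csv"), dict_stream("ITMC-Ticket.csv")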
init.py (2 lines changed)

@@ -281,7 +281,7 @@ def build_words_for_spellchecking(path2words):

def main():
    start = time.time()
-   logprint("Init: {0}".format(datetime.now()))
+   logprint("Init")#: {0}".format(datetime.now()))

    ressources_path = FILEPATH + "ressources/"
Binary files not shown.
main.py (18 lines changed)

@@ -27,13 +27,13 @@ start = time.time()
# idea: train the lda so that the term <-> topic association does not get too weak, but with as many topics as possible
# question: which staff members worked on which topics? idea: replace topics with staff numbers
# idea: check each word against a semantic net first: if too far away, ignore it
+# idea: lda2vec
# todo: test models

-# todo ticket2kbkeys, subj, cats in init.py

-logprint("main.py started at {}".format(datetime.now()))
+logprint("main.py started")

init.main()

@@ -45,9 +45,15 @@ logprint("")
cleaned_corpus = cleaning.main(raw_corpus)
logprint("")

-pre_corpus = preprocessing.main(cleaned_corpus)
+doc_term_matrix, id2term_dict = preprocessing.main(cleaned_corpus)
logprint("")

+topicModeling.textacyTopicModeling_v2(doc_term_matrix, id2term_dict)

"""
ticket_number = "INC40484"
raw=""

@@ -89,11 +95,11 @@ logprint("")
logprint("")

"""
-topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda")
+#topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda")
logprint("")

-topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda")
+#topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda")
logprint("")
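The pipeline change above makes preprocessing.main hand over a document-term matrix plus an id-to-term mapping instead of a corpus object. A minimal sketch of that hand-off using scikit-learn rather than the repo's textacy Vectorizer; the variable names mirror the diff, everything else is illustrative.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["telefon umzug dezernat telefonbuch mail",
        "laptop klaerung dezernat mail",
        "netzwerkanschluss intranet vpn wlan netz"]

vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(docs)           # sparse matrix, docs x terms
id2term_dict = {i: t for t, i in vectorizer.vocabulary_.items()}

lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(doc_term_matrix)
for topic_idx, weights in enumerate(lda.components_):
    top = weights.argsort()[::-1][:3]
    print(topic_idx, [id2term_dict[i] for i in top])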
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
+from datetime import datetime
import configparser as ConfigParser
import csv
import functools

@@ -47,6 +48,7 @@ logging.basicConfig(filename=filename, level=level)

def logprint(string, level="INFO"):
    """log and prints"""
+   string = "{}\t".format(datetime.now()) + str(string)
    print(string)
    if level == "INFO":
        logging.info(string)

@@ -145,14 +147,14 @@ def sort_dictionary(dict):
    return sorted(dict.items(), key=operator.itemgetter(1))

-def normalize(string):
-   # replaceRockDots
-   string = re.sub(r'[ß]', "ss", string.lower())
-   string = re.sub(r'[ö]', "oe", string)
-   string = re.sub(r'[ü]', "ue", string)
-   string = re.sub(r'[ä]', "ae", string)
-   string = textacy.preprocess.normalize_whitespace(string)
-   return string
+def normalize_str(string):
+   """
+   replaceRockDots
+   textacy.preprocess.normalize_whitespace
+   :param string: str
+   :return: str
+   """
+   return textacy.preprocess.normalize_whitespace(replaceRockDots(string))

def deprecated(func):

@@ -200,14 +202,18 @@ def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

-def savelabledCorpiLines(corpus,filepath):
+def savelabledCorpiLines_cat(corpus, filepath):

    textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath)

-def gen_labledLines(corpus):
+def gen_labledLines(corpus, label ="categoryName"):
    for doc in corpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
-       yield "[" + doc.metadata["categoryName"] + "] " + doc.text
+       yield "[" + doc.metadata[label] + "] " + doc.text

def save_corpus(corpus, corpus_path, corpus_name):

@@ -235,6 +241,8 @@ def gen_dicts(corpus):
        dict.update(doc.metadata)
        yield dict

def multisub(subs, subject):
    #https://stackoverflow.com/questions/764360/a-list-of-string-replacements-in-python
    "Simultaneously perform all substitutions on the subject string."
preprocessing.py (868 lines changed): file diff suppressed because it is too large.
test.py (207 lines changed)

@@ -14,17 +14,220 @@ from scipy import *
import json
import draw
"""
+import matplotlib
+matplotlib.use('Agg')
import os
import time

+from textacy import Vectorizer
+from itertools import tee
start = time.time()

from gensim.models import Doc2Vec
+from datetime import datetime
import textacy
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
from miscellaneous import *
+from ressources.iir.lda.llda import *
+import numpy as np
import re
+import draw
+# http://universaldependencies.org/u/pos/

+#corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/", corpus_name="de_clean")

+# array of zeros and ones interleaved
+lrg = np.arange(2).reshape((2,-1)).repeat(1000000,-1).flatten()

+flt = lrg[lrg==0]

+flt = np.array(filter(lambda x:x==0, lrg))

+lines_txt = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/corpi_vor_besprechung/labeled.txt"

+labelset, corpus, labels = load_corp(lines_txt)

+llda = LLDA(20, 0.001, 0.001)
+llda.set_corpus(labelset, corpus, labels)

+for i in range(10):
+    llda.inference()

+phi = llda.phi()
+#print(llda.vocas)

+#for v, voca in enumerate(llda.vocas):
+#    print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]])
+#    print(','.join([voca] + [str(x) for x in phi[:, v]]))

+################# termite plot #####################################################################
+topic_labels = list(labelset)
+term_labels = list(llda.vocas)

+term_topic_weights = phi.transpose()

+threshmin = 0.005

+from scipy.stats import threshold

+thresholded = threshold(term_topic_weights, threshmin=threshmin)

+draw.draw_termite(thresholded, topic_labels, term_labels, save="test.png")

+exit()
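scipy.stats.threshold, used just above, no longer exists in current SciPy releases; the same clipping of small term-topic weights can be done directly with NumPy. A small sketch with a made-up matrix, keeping the threshmin value from the code above:

import numpy as np

term_topic_weights = np.array([[0.20, 0.001],
                               [0.004, 0.12],
                               [0.05, 0.006]])
threshmin = 0.005

# zero out everything below threshmin, like scipy.stats.threshold(..., threshmin=...) used to do
thresholded = np.where(term_topic_weights >= threshmin, term_topic_weights, 0.0)
print(thresholded)
# the entries 0.001 and 0.004 become 0.0, everything else is kept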
+KBA10184_text = "wenn Sie Ihr UniAccount Passwort ändern möchten, können Sie dies im Service Portal unter folgendem Link durchführen: https://service.tu-dortmund.de/uniaccount-passwort"

+corpus = textacy.Corpus("de")
+preprocess_replace_urls = textacy.preprocess.replace_urls(KBA10184_text,replace_with="URL")
+print(preprocess_replace_urls)

+preprocess_replace_urls = textacy.preprocess.transliterate_unicode(KBA10184_text)
+print(preprocess_replace_urls)
+#corpus.add_text(preprocess_replace_urls)

+to_corr = "Sehr geehrtes ITMC Service Team, seit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen " \
+          "An das Intranet der BMP Mit der Dosennummer G1 303 04 12.05 G1 4 26-1 in Raum G1-426 nicht Mehr Zu funktionieren. " \
+          "Ich Wuerde Sie daher bitten diese Mail An den zustaendigen Kollegen weiterzuleiten," \
+          " Um Die Leitung Vielleicht Einmal Zu Pruefen. Des Weiteren haette Ich noch Eine Frage " \
+          "bezueglich der Moeglichkeit zur Nutzung Einer VPN Verbindung Aus unserem Intranet" \
+          " heraus Zu einem fremden Netzwerk. Dies ist zwar Ueber das WLAN-Netz moeglich, jedoch nicht Aus unserem Netzwerk heraus."

+to_corr = "Wichtiger Hinweis: Die Information in dieser E-Mail ist vertraulich. Sie ist ausschließlich für den Adressaten bestimmt. Sollten Sie nicht der für diese E-Mail bestimmte Adressat sein, unterrichten Sie bitte den Absender und vernichten Sie diese Mail. Vielen Dank. Unbeschadet der Korrespondenz per E-Mail, sind unsere Erklärungen ausschließlich final rechtsverbindlich, wenn sie in herkömmlicher Schriftform (mit eigenhändiger Unterschrift) oder durch Übermittlung eines solchen Schriftstücks per Telefax erfolgen."
+corpus.add_text(to_corr)

+for doc in corpus:
+    for sent in doc.sents:
+        for tok in sent:
+            print(tok.text, tok.pos_, tok.dep_)

+corpus.add_text("dfsaf fdsa fdsa",metadata={ "x" : "test"})
+corpus.add_text("iuzizi gbjh iuzit",metadata={ "x" : "testo"})

+vectorizer = Vectorizer(weighting="tf", min_df=1, max_df=1.0)

+# create tuples of term_list and metadata for each doc
+terms_meta_gen, terms_meta_gen_copy = tee(((doc.to_terms_list(ngrams=(1,2), as_strings=True), doc.metadata) for doc in corpus))
+terms_list_gen, terms_list_gen_copy = tee((term_meta[0] for term_meta in terms_meta_gen))

+doc_term_matrix = vectorizer.fit_transform(terms_list_gen)
+id2term = vectorizer.id_to_term
+term2id = vectorizer.vocabulary

+for k,v in term2id.items():
+    print(k,doc_term_matrix[0,v])

+#{0: 'dfsaf', 1: 'fdsa', 2: 'dfsaf fdsa', 3: 'fdsa fdsa', 4: 'iuzizi', 5: 'gbjh', 6: 'iuzit', 7: 'iuzizi gbjh', 8: 'gbjh iuzit'}

+#line_gen = ( doc.metadata["x"] + " ".join([term for term in])

+def gen_lines(docterm,term2id,corpus,label):
+    for i,doc in enumerate(corpus):
+        line = "[" + doc.metadata[label] + "]"
+        for term, id_ in term2id.items():
+            if doc_term_matrix[i, id_] != 0:
+                term = term if len(term.split()) == 1 else "_".join(term.split())
+                line = line + " " + term
+        yield line

+for line in gen_lines(doc_term_matrix,term2id,corpus,"x"):
+    print(line)

+#doc.to_terms_list(ngrams=2, as_strings=True)

+# "" \
+# "" \
+# "" ".join(
+#[term if term in id2term.values() else "" for term in terms_meta[0]]) for terms_meta in terms_meta_gen_copy)

+label = "x"

+#for line in line_gen:
+#    print(line)

+#terms_meta_gen = ( (doc.to_terms_list(ngrams=2, as_strings=True),doc.metadata) for doc in corpus)

+for x in terms_meta_gen:
+    print(x)

+#terms_list_gen = (term_meta[0] for term_meta in terms_meta_gen)

+for x in terms_list_gen:
+    print(x)

+for doc in corpus:
+    for term in doc.to_terms_list(ngrams=2, as_strings=True):
+        print(type(term))

+for doc in corpus:
+    for span in textacy.extract.ngrams(doc,2,
+                                       filter_stops=True, filter_punct=True,
+                                       filter_nums=False, include_pos=None,
+                                       exclude_pos=None, min_freq=1):
+        print(span.text)
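The gen_lines experiment above turns a doc-term matrix back into "[label] term1 term2 ..." lines and joins multiword terms with "_". Here is a self-contained version of that logic without textacy; the tiny matrix, vocabulary and metadata are made up for illustration.

import numpy as np
from scipy.sparse import csr_matrix

term2id = {"telefon": 0, "umzug": 1, "telefon umzug": 2, "mail": 3}
doc_term_matrix = csr_matrix(np.array([[1, 1, 1, 0],
                                       [0, 0, 0, 2]]))
metadata = [{"x": "umzug"}, {"x": "mail"}]

def gen_lines(doc_term_matrix, term2id, metadata, label):
    for i, meta in enumerate(metadata):
        line = "[" + meta[label] + "]"
        for term, id_ in term2id.items():
            if doc_term_matrix[i, id_] != 0:
                # bigrams get their whitespace replaced by "_" so LLDA sees them as one token
                term = term if len(term.split()) == 1 else "_".join(term.split())
                line = line + " " + term
        yield line

for line in gen_lines(doc_term_matrix, term2id, metadata, "x"):
    print(line)
# [umzug] telefon umzug telefon_umzug
# [mail] mail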
topicModeling.py (310 lines changed)

@@ -66,14 +66,13 @@ def textacyTopicModeling(corpus,

    ###### vectorize corpi

    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
-   id2term = vectorizer.__getattribute__("id_to_term")
+   #id2term = vectorizer.__getattribute__("id_to_term")

@@ -113,8 +112,264 @@ def textacyTopicModeling(corpus,
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))

+def textacyTopicModeling_v2(doc_term_matrix, id_to_term,
+                            n_topics = 15, top_topic_words = 3,
+                            topicModel='lda'):

+    n_terms = int(n_topics * top_topic_words)
+    sort_terms_by = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
+    rank_terms_by = 'corpus'  # 'corpus', 'topic'

+    logprint("#### Topic Modeling {0}".format(topicModel))
+    logprint(str("n_topics: {0}".format(n_topics)))
+    logprint("\n")

+    start = time.time()

+    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
+    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

+    ####### Initialize and train a topic model
+    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
+    model.fit(doc_term_matrix)
+    doc_topic_matrix = model.transform(doc_term_matrix)

+    for topic_idx, top_terms in model.top_topic_terms(id_to_term, top_n=top_topic_words, weights=True):
+        logprint('{0}: {1}'.format(topic_idx, str(top_terms)))

+    ####################### termite plot ###################################################################
+    draw1.termite_plot(model,doc_term_matrix, id_to_term,
+                       n_terms=n_terms,
+                       sort_terms_by=sort_terms_by,
+                       rank_terms_by=rank_terms_by + '_weight',
+                       save=FILEPATH + "results/{}.png".format(topicModel))

+    end = time.time()
+    logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
+def create_ticket2label_dict(ticket2chunk_dict,corpus):
+    """
+    Creates a dictionary to map a TicketNumber to a label
+    :param ticket2chunk_dict: e.g. { TicketNumber : KB_entries }
+    :return: {TicketNumber : label }
+    """
+    labelist = ticket2chunk_dict.values()
+    labelist = flatten(labelist)

+    labeldict = create_labeldict(labelist, min_label_freq=1, add_default_label=True)

+    ticket2label = {}
+    for doc in corpus:
+        ticketID = doc.metadata["TicketNumber"]

+        keywords = ticket2chunk_dict.get(ticketID, ['DEFAULT'])

+        label = ""
+        for kw in keywords:
+            label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "

+        ticket2label.update({ticketID: label})

+    return ticket2label

+def create_labeldict(labelist, min_label_freq=1, add_default_label=True):

+    # only keep labels that occur often enough
+    labelist = [l for l in labelist if labelist.count(l) >= min_label_freq]

+    in_labelist_ = {k: labelist.count(k) for k in labelist}  # { label1 : 3 , label2 : 5, label3 : 1 }
+    labelist = sort_dictionary(in_labelist_)  # [ (label3, 1), (label1, 3), (label2, 5) ]
+    labelist.reverse()  # [ (label2, 5), (label1, 3), (label3, 1) ]
+    labeldict = {elem[0]: i for i, elem in enumerate(labelist)}  # { label2 : 0, label1 : 1 , label3 : 2 }
+    if add_default_label:
+        if 'DEFAULT' not in labeldict.keys():
+            labeldict.update({'DEFAULT': len(labelist)})  # { label2 : 0, label1 : 1 , label3 : 2 , DEFAULT : 3 }
+    return labeldict
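A tiny worked example of the create_labeldict ranking above, written with plain sorted() instead of the repo's sort_dictionary helper; the label values are invented:

labelist = ["lan", "sap", "sap", "umzug", "sap", "umzug"]

counts = {k: labelist.count(k) for k in labelist}                     # {'lan': 1, 'sap': 3, 'umzug': 2}
ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)   # most frequent label first
labeldict = {label: i for i, (label, _) in enumerate(ranked)}
labeldict["DEFAULT"] = len(labeldict)                                 # fallback label gets the last id
print(labeldict)   # {'sap': 0, 'umzug': 1, 'lan': 2, 'DEFAULT': 3}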
+#todo
+def jgibbsLLDAv2(labeled_lines_path,ticket2kbs_dict, cleaned_corpus, path2save_results, top_topic_words=7):

+    ticket2label_dict = create_ticket2label_dict(ticket2kbs_dict, cleaned_corpus)

+    # reduce ticket2label_dict
+    labeldict = {}
+    label_list = list(set(ticket2label_dict.values()))
+    lbl_dict = {elem : i for i,elem in enumerate(label_list)}

+    labeldict = {k : lbl_dict[v] for k,v in ticket2label_dict.items()}
+    labeldict.update({"DEFAULT" : len(labeldict)})

+    def gen_lines_from_labeled_lines(input,ticket2label_dict):

+        line_gen = textacy.fileio.read_file_lines(input)

+        for line in line_gen:
+            label = re.findall(r'\[(.*?)\]',line)

+            new_label = "[ "
+            for lbl in label:
+                new_label = new_label + str(ticket2label_dict.get(str(lbl),"")).strip() + " "

+            new_label = new_label + "] "
+            result = new_label + str(line.rpartition("]")[2])

+            # new_label = str([ticket2label_dict.get(str(lbl),"") for lbl in label])
+            # result = "[ " + new_label + " ] " + line.rpartition("]")[2]
+            #print(result)

+            yield result

+    labeldict_rev = {v: k for k, v in labeldict.items()}

+    #line_gen = gen_lines_from_labeled_lines(labeled_lines_path,ticket2label_dict)
+    line_gen = gen_lines_from_labeled_lines(labeled_lines_path,labeldict)

+    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
+    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

+    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)

+    # wait for file to exist
+    while not os.path.exists(LLDA_filepath):
+        time.sleep(1)

+    # run JGibbsLLDA file

+    n_topics = len(labeldict) #+1 #default-topic

+    FNULL = open(os.devnull, 'w')  # suppress output
+    cmd_jgibbs_java = ["java", "-cp",
+                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
+                           jgibbsLLDA_root),
+                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
+                       "tickets.gz",
+                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
+    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

+    # NOTE: the output files are hidden; they can be found in models/
+    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
+    output = subprocess.check_output(cmd_gzip).decode("utf-8")

+    topic_regex = re.compile(r'Topic [0-9]*')

+    #####################################
+    # todo: save the results to a file, based on results
+    result = []

+    for line in output.splitlines():
+        findall = topic_regex.findall(line)
+        if len(findall) != 0:
+            try:
+                index = int(findall[0].split()[1])
+                result.append("Topic {} {}:".format(index, str(ticket2kbs_dict[labeldict_rev[index]])))

+            except:
+                result.append(line)

+        else:
+            result.append(line)

+    textacy.fileio.write_file_lines(result, path2save_results+".txt")
+    #####################################

+    results = []
+    res_dict = {}
+    count =0
+    for line in output.splitlines():

+        findall = topic_regex.findall(line)

+        if len(findall) != 0:

+            if len(res_dict) != 0:
+                results.append(res_dict)  # append the previous one to the list (it is finished at this point)

+            index = int(findall[0].split()[1])

+            res_dict = {index : str(labeldict_rev[index]) }

+        else:
+            splitted = line.split()
+            res_dict[splitted[0]] = float(splitted[1])

+    if len(res_dict) != 0:
+        results.append(res_dict)  # append the last one to the list

+    # collect every term in the results into a list

+    terms=[]
+    for res in results:
+        for key,value in res.items():
+            if not isinstance(key, int) and not key in terms:
+                terms.append(key)

+    term2id = {t:i for i,t in enumerate(terms)}  # and into a dict

+    ################# termite plot #####################################################################
+    topic_labels = list(range(len(labeldict)))
+    term_labels = list(range(len(term2id)))  # tuple([key for key in term2id.keys()])

+    term_topic_weights = np.zeros((len(term2id),len(topic_labels)))

+    for i,res in enumerate(results):

+        for key,value in res.items():

+            if not isinstance(key, int):
+                term_topic_weights[term2id[key]][i] = value
+                term_labels[term2id[key]] = key
+            else:
+                topic_labels[i] = labeldict_rev[key]

+    draw.draw_termite(
+        term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")

+    draw.draw_termite(
+        term_topic_weights, topic_labels, term_labels, save=path2save_results+"_spaced.png",pow_x=0.78,pow_y=0.87)

+    # save labeldict
+    labeldict_path = path2save_results + "_labeldict.json"
+    with open(labeldict_path, 'w') as file:
+        file.write(json.dumps(labeldict))
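The second loop in jgibbsLLDAv2 parses the decompressed .twords output into per-topic term weights and then fills a term-topic matrix for the termite plot. A compact, runnable sketch of that parsing step; the fake output string only approximates the JGibbsLLDA format ("Topic N:" headers followed by "term weight" lines), and the real file may differ in detail.

import re
import numpy as np

output = "Topic 0:\n\tpasswort 0.120\n\tuniaccount 0.080\nTopic 1:\n\ttelefon 0.200\n\tumzug 0.050\n"

topic_regex = re.compile(r"Topic [0-9]+")
results, res_dict = [], {}
for line in output.splitlines():
    if topic_regex.findall(line):
        if res_dict:
            results.append(res_dict)          # previous topic is finished
        res_dict = {"topic": int(line.split()[1].rstrip(":"))}
    elif line.strip():
        term, weight = line.split()
        res_dict[term] = float(weight)
if res_dict:
    results.append(res_dict)

terms = sorted({k for r in results for k in r if k != "topic"})
term2id = {t: i for i, t in enumerate(terms)}
weights = np.zeros((len(terms), len(results)))   # terms x topics, as fed to draw_termite
for j, r in enumerate(results):
    for k, v in r.items():
        if k != "topic":
            weights[term2id[k], j] = v
print(terms)
print(weights)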
@@ -259,15 +514,21 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
    logprint("start Category-LLDA:")

+   # labeldict ############################################################################################

    # build dictionary of ticketcategories
    labelist = []
    for doc in corpus:
-       category = normalize(doc.metadata["categoryName"])
+       category = normalize_str(doc.metadata["categoryName"])
        labelist.append(category)

-   # question: only use the x most frequent labels, drop the rest?
-   labelist = [l for l in labelist if labelist.count(l) > 50 ]
+   x = 50  # question: only use the x most frequent labels, drop the rest?
+   labelist = [l for l in labelist if labelist.count(l) > x ]

    in_labelist_ = {k: labelist.count(k) for k in labelist}
    labelist = sort_dictionary(in_labelist_)

@@ -290,13 +551,13 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):

    labeldict.update({'DEFAULT': len(labelist)})

+   ##############################################################################################

    def gen_cat_lines(textacyCorpus, labeldict):
        """ generates [topic1, topic2....] tok1 tok2 tok3 out of corpi"""

        for doc in textacyCorpus:
-           label = labeldict.get(normalize(doc.metadata["categoryName"]), labeldict['DEFAULT'])
+           label = labeldict.get(normalize_str(doc.metadata["categoryName"]), labeldict['DEFAULT'])

            if label is not 'DEFAULT':
@@ -324,6 +585,11 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
    logprint("")
    logprint("start {}-LLDA:".format("Keyword" if kb_keywords else "Subject"))

+   # labeldict ############################################################################################

    # ticket2kb_dict
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")

@@ -358,7 +624,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
    subject = line[1]

    keywords = line[2]
-   keywords_list = [normalize(x) for x in str(keywords).split(",")]
+   keywords_list = [normalize_str(x) for x in str(keywords).split(",")]

    if kb_id not in kb2keywords_dict.keys():
        kb2keywords_dict[kb_id] = []

@@ -406,7 +672,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
    keywords = kb2keywords_dict.get(kb_id, None)

    if keywords and kb_id:
-       used_keywords.append(list(map(normalize,keywords)))
+       used_keywords.append(list(map(normalize_str, keywords)))

@@ -418,6 +684,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa

    labeldict = {k: v for v, k in enumerate(labelist)}

+   ##############################################################################################

    def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):

@@ -433,7 +700,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa

        label = ""
        for kw in keywords:
-           label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "
+           label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " "

        yield "[ " + label + "] " + doc.text

@@ -451,7 +718,6 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
    logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60,"Keyword" if kb_keywords else "Subject"))

def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

    start = time.time()

@@ -461,6 +727,10 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

+   # labeldict ############################################################################################

    # kb2keywords_dict / kb2subjects_dict --> {str : [str]}

@@ -476,9 +746,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
    kb_id = line[0]

-   subject = normalize(line[1])
+   subject = normalize_str(line[1])

-   keywords = [normalize(x) for x in str(line[2]).split(",")]
+   keywords = [normalize_str(x) for x in str(line[2]).split(",")]

    if kb_id not in kb2keywords_dict.keys():

@@ -488,9 +758,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

    if kb_id not in kb2subjects_dict.keys():
-       kb2subjects_dict[kb_id] = [normalize(subject) if subject != [''] else "DEFAULT"]
+       kb2subjects_dict[kb_id] = [normalize_str(subject) if subject != [''] else "DEFAULT"]
    else:
-       kb2subjects_dict[kb_id].append(normalize(subject))
+       kb2subjects_dict[kb_id].append(normalize_str(subject))

@@ -586,8 +856,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
    labelist = list(set(labelist))
    labeldict = {k: v for v, k in enumerate(labelist)}

+   ##############################################################################################

    def gen_key_lines(textacyCorpus, labeldict, ticket2keywords_dict):

@@ -601,7 +870,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

        label = ""
        for kw in keywords:
-           label = label + str(labeldict.get(normalize(str(kw)), labeldict['DEFAULT'])) + " "
+           label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "

        yield "[ " + label + "] " + doc.text

@@ -642,10 +911,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
def load_from_labled_lines(path):
    path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/pre_labled_lines_wo_lemma_061217.txt"

-   #idea / plan
+   #idea
    # load clean, load pre
-   # unigrams and num/word bigrams into doc-term  # question: how does llda handle bigrams? idea: join bigrams with _
-   # only take ngrams where at least one token occurs in pre
+   # unigrams and num/word bigrams into doc-term  # question: how does llda handle bigrams? idea: join bigrams with _  # only take ngrams where at least one token occurs in pre
@ -415,7 +415,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=Fals
|
||||||
keywords = kb2keywords_dict.get(kb_number, None)
|
keywords = kb2keywords_dict.get(kb_number, None)
|
||||||
|
|
||||||
if keywords and kb_number:
|
if keywords and kb_number:
|
||||||
used_keywords.append(list(map(normalize,keywords)))
|
used_keywords.append(list(map(normalize_str, keywords)))
|
||||||
|
|
||||||
kb_entries_used = (len(list(set([kb for kb in ticket2kb_dict.values()]))))
|
kb_entries_used = (len(list(set([kb for kb in ticket2kb_dict.values()]))))
|
||||||
print("kb_entries_used: {}".format(kb_entries_used))
|
print("kb_entries_used: {}".format(kb_entries_used))
|
||||||
|
@ -447,7 +447,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=Fals
|
||||||
|
|
||||||
label = ""
|
label = ""
|
||||||
for kw in keywords:
|
for kw in keywords:
|
||||||
label = label + str(labeldict.get( normalize(str(kw)) , len(labeldict))) + " "
|
label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " "
|
||||||
|
|
||||||
yield "[ " + label + "] " + doc.text
|
yield "[ " + label + "] " + doc.text
|
||||||
|
|
||||||
|
|