Termite plot for LDA
This commit is contained in:
parent
3137dc6e54
commit
6ea03b2f65
@@ -29,7 +29,7 @@ global REGEX_SPECIALCHAR
 global WORDS

-REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'
+REGEX_SPECIALCHAR = r'[`\=~%^&*()_+\[\]{};\'"|</>]' #+r',.-\\:' #+r',.?!'

 WORDS= {}
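A quick sketch (not part of the commit) of what the narrowed character class changes; the sample string and variable names are made up:

    import re

    OLD = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]'
    NEW = r'[`\=~%^&*()_+\[\]{};\'"|</>]'

    s = "e-mail: c:\\temp {test}"
    print(" ".join(re.compile(OLD).split(s)))  # also splits on '-', ':' and '\', breaking tokens apart
    print(" ".join(re.compile(NEW).split(s)))  # keeps 'e-mail:' and 'c:\temp' intact, still splits on braces

The old class treated '-', '\' and ':' as separators; the new one moves them into the commented-out remainder, so tokens containing them survive the split.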
@@ -99,7 +99,7 @@ def clean(stringstream,autocorrect=False):
    string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')

    # seperate_words_on_regex:
-   string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string)) #question: ,.?!
+   string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))

    #normalize whitespace
    string = textacy.preprocess.normalize_whitespace(string)
@@ -41,8 +41,8 @@ filename=topicModelTickets.log
 [de_corpus]
-input=M42-Export/Tickets_small.csv
-#input=M42-Export/de_tickets.csv
+#input=M42-Export/Tickets_small.csv
+input=M42-Export/de_tickets.csv

 path=corpi/
@@ -62,12 +62,11 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
 [preprocessing]

-ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC
+#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC

-custom_words=geehrt,dame,herr,hilfe,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,institut,universitaet,name,gruss,id,erfolg,mail,folge,nummer,team,fakultaet,email,absender,tu,versenden,vorname,message,service,strasse,prozess,portal,raum,personal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,kollege,pruefen,hoffen
+custom_words=geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok

 #lemmatize=True

 [topicmodeling]
File diff suppressed because it is too large
@@ -0,0 +1,622 @@
+a
+ab
+aber
+ach
+acht
+achte
+trotz
+achten
+achter
+achtes
+ag
+alle
+allein
+allem
+allen
+aller
+allerdings
+alles
+allgemeinen
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+anderm
+andern
+anderr
+anders
+au
+auch
+auf
+aus
+ausser
+ausserdem
+außer
+außerdem
+b
+bald
+bei
+beide
+beiden
+beim
+beispiel
+bekannt
+bereits
+besonders
+besser
+besten
+bin
+bis
+bisher
+bist
+c
+d
+d.h
+da
+dabei
+dadurch
+dafür
+dagegen
+daher
+dahin
+dahinter
+damals
+damit
+danach
+daneben
+dank
+dann
+daran
+darauf
+daraus
+darf
+darfst
+darin
+darum
+darunter
+darüber
+das
+dasein
+daselbst
+dass
+dasselbe
+davon
+davor
+dazu
+dazwischen
+daß
+dein
+deine
+deinem
+deinen
+deiner
+deines
+dem
+dementsprechend
+demgegenüber
+demgemäss
+demgemäß
+demselben
+demzufolge
+den
+denen
+denn
+denselben
+der
+deren
+derer
+derjenige
+derjenigen
+dermassen
+dermaßen
+derselbe
+derselben
+des
+deshalb
+desselben
+dessen
+deswegen
+dich
+die
+diejenige
+diejenigen
+dies
+diese
+dieselbe
+dieselben
+diesem
+diesen
+dieser
+dieses
+dir
+doch
+dort
+drei
+drin
+dritte
+dritten
+dritter
+drittes
+du
+durch
+durchaus
+durfte
+durften
+dürfen
+dürft
+e
+eben
+ebenso
+ehrlich
+ei
+ei,
+eigen
+eigene
+eigenen
+eigener
+eigenes
+ein
+einander
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+eins
+elf
+en
+ende
+endlich
+entweder
+er
+ernst
+erst
+erste
+ersten
+erster
+erstes
+es
+etwa
+etwas
+euch
+euer
+eure
+eurem
+euren
+eurer
+eures
+f
+folgende
+früher
+fünf
+fünfte
+fünften
+fünfter
+fünftes
+für
+g
+gab
+ganz
+ganze
+ganzen
+ganzer
+ganzes
+gar
+gedurft
+gegen
+gegenüber
+gehabt
+gehen
+geht
+gekannt
+gekonnt
+gemacht
+gemocht
+gemusst
+genug
+gerade
+gern
+gesagt
+geschweige
+gewesen
+gewollt
+geworden
+gibt
+ging
+gleich
+gott
+gross
+grosse
+grossen
+grosser
+grosses
+groß
+große
+großen
+großer
+großes
+gut
+gute
+guter
+gutes
+h
+hab
+habe
+haben
+habt
+hast
+hat
+hatte
+hatten
+hattest
+hattet
+heisst
+her
+heute
+hier
+hin
+hinter
+hoch
+hätte
+hätten
+i
+ich
+ihm
+ihn
+ihnen
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+im
+immer
+in
+indem
+infolgedessen
+ins
+irgend
+ist
+j
+ja
+jahr
+jahre
+jahren
+je
+jede
+jedem
+jeden
+jeder
+jedermann
+jedermanns
+jedes
+jedoch
+jemand
+jemandem
+jemanden
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+k
+kam
+kann
+kannst
+kaum
+kein
+keine
+keinem
+keinen
+keiner
+keines
+kleine
+kleinen
+kleiner
+kleines
+kommen
+kommt
+konnte
+konnten
+kurz
+können
+könnt
+könnte
+l
+lang
+lange
+leicht
+leide
+lieber
+los
+m
+machen
+macht
+machte
+mag
+magst
+mahn
+mal
+man
+manche
+manchem
+manchen
+mancher
+manches
+mann
+mehr
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mensch
+menschen
+mich
+mir
+mit
+mittel
+mochte
+mochten
+morgen
+muss
+musst
+musste
+mussten
+muß
+mußt
+möchte
+mögen
+möglich
+mögt
+müssen
+müsst
+müßt
+n
+na
+nach
+nachdem
+nahm
+natürlich
+neben
+nein
+neue
+neuen
+neun
+neunte
+neunten
+neunter
+neuntes
+nicht
+nichts
+nie
+niemand
+niemandem
+niemanden
+noch
+nun
+nur
+o
+ob
+oben
+oder
+offen
+oft
+ohne
+ordnung
+p
+q
+r
+recht
+rechte
+rechten
+rechter
+rechtes
+richtig
+rund
+s
+sa
+sache
+sagt
+sagte
+sah
+satt
+schlecht
+schluss
+schon
+sechs
+sechste
+sechsten
+sechster
+sechstes
+sehr
+sei
+seid
+seien
+sein
+seine
+seinem
+seinen
+seiner
+seines
+seit
+seitdem
+selbst
+sich
+sie
+sieben
+siebente
+siebenten
+siebenter
+siebentes
+sind
+so
+solang
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollen
+sollst
+sollt
+sollte
+sollten
+sondern
+sonst
+soweit
+sowie
+später
+startseite
+statt
+steht
+suche
+t
+tag
+tage
+tagen
+tat
+teil
+tel
+tritt
+trotzdem
+tun
+u
+uhr
+um
+und
+und?
+uns
+unse
+unsem
+unsen
+unser
+unsere
+unserer
+unses
+unter
+v
+vergangenen
+viel
+viele
+vielem
+vielen
+vielleicht
+vier
+vierte
+vierten
+vierter
+viertes
+vom
+von
+vor
+w
+wahr?
+wann
+war
+waren
+warst
+wart
+warum
+was
+weg
+wegen
+weil
+weit
+weiter
+weitere
+weiteren
+weiteres
+welche
+welchem
+welchen
+welcher
+welches
+wem
+wen
+wenig
+wenige
+weniger
+weniges
+wenigstens
+wenn
+wer
+werde
+werden
+werdet
+weshalb
+wessen
+wie
+wieder
+wieso
+will
+willst
+wir
+wird
+wirklich
+wirst
+wissen
+wo
+woher
+wohin
+wohl
+wollen
+wollt
+wollte
+wollten
+worden
+wurde
+wurden
+während
+währenddem
+währenddessen
+wäre
+würde
+würden
+x
+y
+z
+z.b
+zehn
+zehnte
+zehnten
+zehnter
+zehntes
+zeit
+zu
+zuerst
+zugleich
+zum
+zunächst
+zur
+zurück
+zusammen
+zwanzig
+zwar
+zwei
+zweite
+zweiten
+zweiter
+zweites
+zwischen
+zwölf
+über
+überhaupt
+übrigens
File diff suppressed because it is too large
Binary files not shown.
main.py (14 changed lines)
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+import matplotlib
+matplotlib.use('Agg')
 import time
 import corporization
 import preprocessing
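The two added lines select matplotlib's non-interactive Agg backend, which renders straight to raster files; on the matplotlib versions of that era the backend had to be chosen before pyplot is first imported, which is why the call sits at the very top of main.py. A minimal sketch of the pattern (file name made up):

    import matplotlib
    matplotlib.use('Agg')            # choose the file-only backend before pyplot loads
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1])
    fig.savefig("check.png")         # works on a headless server, no display needed

This is what lets the termite plots further down be saved as PNGs on the server.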
@@ -21,12 +22,17 @@ logprint("")
 cleaning.main()
 logprint("")

-preprocessing.main()
+preprocessing.main() # ~5h
 logprint("")

-topicModeling.main(use_raw=False)
+#topicModeling.main(use_raw=False,algorithm="llda")
 logprint("")

-topicModeling.main(use_raw=True)
+#topicModeling.main(use_raw=True)
+topicModeling.main(use_raw=False,algorithm="lda")
 logprint("")

 logprint("")

 end = time.time()
@@ -201,10 +201,16 @@ def save_corpus(corpus, corpus_path, corpus_name):

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
-   textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
+   #meta_gen = (doc.metadata.update({"index": doc.corpus_index}) for doc in corpus)
+   meta_gen = gen_meta(corpus)
+   textacy.fileio.write_json_lines(meta_gen, metapath)


+def gen_meta(corpus):
+   for doc in corpus:
+       meta = doc.metadata
+       meta.update({"index": doc.corpus_index})
+       yield meta
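The commented-out generator expression above hints at why gen_meta() exists: dict.update() mutates in place and returns None, so a generator built from doc.metadata.update(...) would feed write_json_lines a stream of None values. A small sketch of the difference (made-up metadata):

    meta = {"categoryName": "neuanschluss"}
    broken = [m.update({"index": i}) for i, m in enumerate([meta])]
    print(broken)        # -> [None]: update() has no return value

    def gen_meta(metas):
        for i, m in enumerate(metas):
            m.update({"index": i})   # mutate first ...
            yield m                  # ... then yield the dict itself

    print(list(gen_meta([{"categoryName": "neuanschluss"}])))
    # -> [{'categoryName': 'neuanschluss', 'index': 0}]

Writing the corpus index into each meta line is what makes the index check in load_corpus below possible.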
@@ -242,7 +248,7 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    plainpath = corpus_path + corpus_name + "_content.json"
    metapath = corpus_path + corpus_name + "_meta.json"

    """
    try:
        spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
        metadata_stream = textacy.fileio.read_json_lines(metapath)
@@ -251,15 +257,18 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
            corpus.add_doc(
                textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    except:
        # re-init!!
        corpus = textacy.Corpus(parser)
    """
+   # re-init!!
+   #corpus = textacy.Corpus(parser)

    plain_stream = textacy.fileio.read_json_lines(plainpath)  # yields {int : str}
    metadata_stream = textacy.fileio.read_json_lines(metapath)

-   for plain, metadata in zip(plain_stream, metadata_stream):
-       corpus.add_doc(
-           textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
+   for plain, metadata in zip(plain_stream, metadata_stream):
+       if plain["index"] == metadata["index"]:
+           corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
+       else:
+           raise IndexError

    return corpus, corpus.spacy_lang
@@ -247,7 +247,7 @@ def processContentstream2(textstream, parser, token_filterlist=None):
        tokens = filterTokens(tokens, token_filterlist)

        # post parse
-       tokens = [postparse(tok) for tok in tokens] #todo information loss!
+       tokens = [postparse(tok) for tok in tokens] #todo information loss of pos, tag etc.!

        yield " ".join(tokens)
@@ -415,7 +415,7 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print

    #save corpus as labled, plain text
-   plainpath = FILEPATH + config.get("de_corpus", "path") + "labled_lines.txt"
+   plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt"
    textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath )

    return corpus
@@ -438,6 +438,8 @@ def main():
    NOUNS = load_obj(path2nouns_list)
    VORNAMEN = load_obj(path2firstnameslist)

+   custom_words = config.get("preprocessing","custom_words").split(",")

    filter_tokens = [
        # removeENT(["PERSON"]),
@@ -447,8 +449,8 @@ def main():
        removePOS(["PUNCT", "SPACE", "NUM"]),

        # removeWords(de_stop_words + custom_words),
-       removeWords(DE_STOP_WORDS),
+       removeWords(DE_STOP_WORDS + custom_words),
+       #removeWords(DE_STOP_WORDS),

        remove_long_words(),
        remove_short_words(),
testra.py (23 changed lines)
@@ -22,11 +22,7 @@ from miscellaneous import *

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"

parser = spacy.load("de")

"""
# load config
config_ini = FILEPATH + "config.ini"
@@ -71,10 +67,23 @@ corpi.add_texts(
    makecontent(testcontetn),
    makemeta(testmetda)
)
corpus_de_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test/"
rawCorpus_name = "de_test_ticket"
print(corpi)

save_corpus(corpi,corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test",corpus_name="test")

bla = "uni mail account adresse woche falsch laufen schicken gerne januar betreff herr nachricht gruesse dezernat liebe datum freitag anfrage dienstag unicard karte abholen defekt bibliothek abholung dezember beantragung status gerne portal email nummer service id vorname prozess dez schauen eg rechner mitarbeiterin benutzerkonto oktober wissenschaftliche projekt fr download hilfskraft verantwortliche link dringend antrag schnelle arbeitsplatz november admin rahmen stand geschickt server outlook ordner bild konto postfach campus hi ueberpruefung sued beste daten freuen semester login benutzer gerne erstellen stelle frage system boss moeglichkeit student schoen spam alias geld vertrag juni ansprechpartner telefon raum einrichtung gebaeude telefonbuch abteilung element eintrag nutzer raum pc gerne lehrstuhl voraus fakultaet verfuegung herzliche drucker erreichen tlaptop kabel problem klaerung url adapter feedback koeln grundsaetzlich kaufmann problem fehler verbindung anhang meldung client netz netzwerk wenden funktionieren liebe mitarbeiter unterstuetzung aktuell herr benoetigt raumplanung gb weber vorab ueckmeldung software lizenz programm kurze urlaub gerne installation dankbar informieren team service problem loesung bestellung verlaengern verteiler alte aendern februar oeffnen update pdf browser notwendig fenster schulung beginn wege nord tkurs frage studierende personen teilnehmer standort gerne herunterladen voraus zusenden ews veranstaltung datei iso text umstellung absender message date html arbeit kaiser erfolgreich thema ablauf art at einfuehrung umfrage cloud zugang zugreifen montag probleme kollegin profil server handy web file ticket drucker einrichten senden nr mittwoch card mitteilen nrw kontakt mail fax universitaet it institut hardware hinweis fakultaet not strasse loeschen liste funktion auftrag zeitraum verwaltung angebot vorgehen entfernen moeglichkeit gefunden benutzername informatik gruppe eingabe nachname chemie dame b. angepasst name schoene abt post zukommen verlaengerung sommersemester fehlen namensaenderung auskunft tu dr prof pruefung herr namen fakultaet bereich lehrstuhl installieren buero ok anschluss maerz theologie notebook herr berechtigung master vorbeikommen passwort anmelden account hilfe helfen uniaccount anmeldung kennwort problem boss zugriff referat screenshot support laufwerk bildschirm super tastatur button auswaehlen"
bla = bla.split()
print(len(bla))
print(len(set(bla)))
print()

x = {'a':1, 'b': 2}
y = {'b':10, 'c': 11}
z = x.update(y)

print(x)
"""
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)

#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
topicModeling.py (178 changed lines)
@@ -73,12 +73,15 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='l

    start = time.time()

-   top_topic_words = 10
+   top_topic_words = 7
    top_document_labels_per_topic = 5

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    ####################'####################
@@ -120,16 +123,31 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='l
    print()
    print()

+   # termite plot
+   n_terms = int(n_topics*top_topic_words)
+   sort_terms_by = 'seriation'  #'seriation', 'weight', 'index', 'alphabetical'
+   rank_terms_by = 'corpus'  # 'corpus', 'topic'
+
+   model.termite_plot(doc_term_matrix, id2term,
+                      n_terms=n_terms,
+                      sort_terms_by=sort_terms_by,
+                      rank_terms_by=rank_terms_by+'_weight',
+                      save="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/results/{}_{}_{}_{}_{}.png".format(topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))


-def jgibbsLLDA(corpus, path2save_results, top_topic_words=15, add_default_topic=False):
+def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=False):
    ##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################

    start = time.time()

-   # build citionary of ticketcategories
+   # build dictionary of ticketcategories
    labelist = []

    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
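For reference, a self-contained sketch of the plotting pipeline this hunk adds, assuming the 0.4-era textacy API used elsewhere in this repo; the corpus variable and output name are placeholders:

    import textacy

    # one terms list per document -> document-term matrix; id2term maps column index to term
    terms_lists = (doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
                   for doc in corpus)   # corpus: a loaded textacy.Corpus
    doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
        terms_lists, weighting='tf', min_df=1, max_df=0.8)

    model = textacy.tm.TopicModel('lda', n_topics=15)
    model.fit(doc_term_matrix)

    # rows = terms, columns = topics; circle size encodes the term's weight in the topic
    model.termite_plot(doc_term_matrix, id2term,
                       n_terms=15 * 7,                 # n_topics * top_topic_words, as in the diff
                       sort_terms_by='seriation',
                       rank_terms_by='corpus_weight',
                       save='lda_termite.png')

With the Agg backend selected in main.py, the save= argument writes the figure without ever opening a window.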
@@ -161,7 +179,7 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=15, add_default_topic=
    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    #top_topic_words=1
    logprint("")
    logprint("start LLDA:")
    # run JGibsslda file
@@ -208,7 +226,7 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=15, add_default_topic=

    reverse_labeldict = {v: k for k, v in labeldict.items()}
    result = []
-   regex = re.compile(r'Topic [0-9]')
+   regex = re.compile(r'Topic [0-9]*')
    for line in output.splitlines():

        findall = regex.findall(line)
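Why the trailing '*': r'Topic [0-9]' matches exactly one digit, so topic IDs above 9 were truncated when parsing the JGibbsLDA output. A quick check (the output line is made up):

    import re
    line = "Topic 12th topic words ..."
    print(re.compile(r'Topic [0-9]').findall(line))    # -> ['Topic 1']
    print(re.compile(r'Topic [0-9]*').findall(line))   # -> ['Topic 12']

Note that [0-9]* would also match a bare "Topic " with no digits at all; [0-9]+ would be the stricter fix.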
@@ -231,16 +249,22 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=15, add_default_topic=
    logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))


-def main(use_raw=False):
+def main(use_raw=False, algorithm="llda"):
    logprint("Topic Modeling: {0}".format(datetime.now()))

    corpus_de_path = FILEPATH + config.get("de_corpus", "path")
    corpus_en_path = FILEPATH + config.get("en_corpus", "path")

    if use_raw:
        # error: Unknown document label ( X ) for document 352.
        preCorpus_name = "de" + "_raw_ticket"
        resultspath = FILEPATH + "results/raw"
    else:
        preCorpus_name = "de" + "_pre_ticket"
        resultspath = FILEPATH + "results/pre"

    # load raw corpus and create new one
    de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
@@ -252,11 +276,10 @@ def main(use_raw=False):
    # idea https://gate.ac.uk/family/

    # todo find a sensible tf(-idf) measure
    # todo per model: save the labeled corpus, results and labeldict
    # todo merge topics
    # question: how many tickets per topic?

    """
    ngrams = 1
    min_df = 1
    max_df = 1.0
@@ -264,7 +287,7 @@ def main(use_raw=False):
    # weighting ='tfidf'
    named_entities = False
    """

-   printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
+   printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
    printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)
@@ -274,34 +297,30 @@ def main(use_raw=False):
    printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
    """

-   if use_raw:
-       resultspath = FILEPATH + "results/raw"
-   else:
-       resultspath = FILEPATH + "results/pre"
-
-   top_topic_words = 5
-   add_default_topic = False
-   path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
-   jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
-              add_default_topic=add_default_topic)
-
-   top_topic_words = 5
-   add_default_topic = True
-   path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
-   jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
-              add_default_topic=add_default_topic)
-
-   top_topic_words = 10
-   add_default_topic = False
-   path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
-   jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
-              add_default_topic=add_default_topic)
-
-   top_topic_words = 10
-   add_default_topic = True
-   path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
-   jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
-              add_default_topic=add_default_topic)
+   if algorithm == "llda":
+       top_topic_words = 5
+       add_default_topic = False
+       path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+       jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
+                  add_default_topic=add_default_topic)
+
+       top_topic_words = 5
+       add_default_topic = True
+       path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+       jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
+                  add_default_topic=add_default_topic)
+
+       top_topic_words = 10
+       add_default_topic = False
+       path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+       jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
+                  add_default_topic=add_default_topic)
+
+       top_topic_words = 10
+       add_default_topic = True
+       path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+       jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
+                  add_default_topic=add_default_topic)

    # no_below = 20
    # no_above = 0.5
@@ -310,42 +329,70 @@ def main(use_raw=False):
    # n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 because of a default topic

-   """
-   topicModeling(ngrams = 1,
-                 min_df = 1,
-                 max_df = 1.0,
-                 topicModel = 'lda',
-                 n_topics = len(LABELDICT),
-                 corpi=de_corpus)
-
-   topicModeling(ngrams = 1,
-                 min_df = 0.1,
-                 max_df = 0.6,
-                 topicModel = 'lda',
-                 n_topics = len(LABELDICT),
-                 corpi=de_corpus)
-
-   topicModeling(ngrams = (1,2),
-                 min_df = 1,
-                 max_df = 1.0,
-                 topicModel = 'lda',
-                 n_topics = len(LABELDICT),
-                 corpi=de_corpus)
-
-   topicModeling(ngrams = (1,2),
-                 min_df = 0.1,
-                 max_df = 0.6,
-                 topicModel = 'lda',
-                 n_topics = len(LABELDICT),
-                 corpi=de_corpus)
-
-   topicModeling(ngrams = (1,2),
-                 min_df = 0.2,
-                 max_df = 0.8,
-                 topicModel = 'lda',
-                 n_topics = 20,
-                 corpi=de_corpus)
-   """
+   else:
+
+       # build dictionary of ticketcategories
+       labelist = []
+
+       for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
+           labelist.append(texdoc.metadata["categoryName"])
+
+       labeldict = {k: v for v, k in enumerate(labelist)}
+
+       n_topics = 15
+
+       """
+       textacyTopicModeling(ngrams = 1,
+                            min_df = 1,
+                            max_df = 0.8,
+                            topicModel = algorithm,
+                            n_topics =n_topics,
+                            corpus=de_corpus)
+       """
+
+       textacyTopicModeling(ngrams = (1,2),
+                            min_df = 1,
+                            max_df = 0.8,
+                            topicModel = algorithm,
+                            n_topics =n_topics,
+                            corpus=de_corpus)
+
+       """
+       textacyTopicModeling(ngrams = 1,
+                            min_df = 0.1,
+                            max_df = 0.6,
+                            topicModel = algorithm,
+                            n_topics =n_topics,
+                            corpus=de_corpus)
+
+       textacyTopicModeling(ngrams = (1,2),
+                            min_df = 1,
+                            max_df = 1.0,
+                            topicModel = algorithm,
+                            n_topics =n_topics,
+                            corpus=de_corpus)
+
+       textacyTopicModeling(ngrams = (1,2),
+                            min_df = 0.1,
+                            max_df = 0.6,
+                            topicModel = algorithm,
+                            n_topics =n_topics,
+                            corpus=de_corpus)
+
+       textacyTopicModeling(ngrams = (1,2),
+                            min_df = 0.2,
+                            max_df = 0.8,
+                            topicModel = algorithm,
+                            n_topics = 20,
+                            corpus=de_corpus)
+       """
@@ -353,7 +400,6 @@ def main(use_raw=False):

    """

if __name__ == "__main__":