# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('Agg')

from datetime import datetime
import csv
import sys
import re
from textacy import Vectorizer
import draw1
from miscellaneous import *
import time
import textacy
from scipy import *
from scipy.stats import threshold
import draw
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.span import Span as SpacySpan
from topicModeling import jgibbsLLDAv2
import os
from ressources.iir.lda.llda import *

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"
ressources_path = FILEPATH + "ressources/"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
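
# Note (assumption): config.ini is expected to provide at least the sections and keys
# read in this module (see the path2* assignments and get_list_from_config() calls
# further below); the values here are only illustrative:
#
#   [thesaurus]
#   pickle_file = thesaurus_dict.pkl
#
#   [lemmatization]
#   pickle_file = lemma_dict.pkl
#
#   [de_stopwords]
#   pickle_file = de_stopwords_list.pkl
#
#   [firstnames]
#   pickle_file = firstnames_list.pkl
#
#   [preprocessing]
#   custom_words = ticket,bitte,danke
#
#   [de_corpus]
#   path = corpi/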
"""
def init_glabal_vars ( ) :
global THESAURUS , WORDS , LEMMAS , NOUNS , VORNAMEN , DE_STOP_WORDS , EN_STOP_WORDS , WHITELIST , FOOTER_FLAG , CURRENT_TICKET
2017-08-31 14:54:01 +02:00
2017-12-19 17:12:35 +01:00
THESAURUS = { }
WORDS = { }
LEMMAS = { }
NOUNS = { }
VORNAMEN = { }
DE_STOP_WORDS = { }
EN_STOP_WORDS = { }
WHITELIST = { }
CURRENT_TICKET = " "
2017-10-18 17:37:20 +02:00
2017-10-25 09:46:44 +02:00
REGEX_SPECIALCHAR = r ' [` \ -=~ % ^&*()_+ \ [ \ ] {} ; \' \\ : " |</>] ' #+r',.'
REGEX_TOPLVL = r ' \ .[a-z] { 2,3}( \ .[a-z] { 2,3})? '
2017-12-19 17:12:35 +01:00
"""
2017-10-25 09:46:44 +02:00
2017-12-19 17:12:35 +01:00
"""
2017-10-16 14:01:38 +02:00
THESAURUS = { }
2017-10-18 17:37:20 +02:00
WORDS = { }
LEMMAS = { }
NOUNS = { }
VORNAMEN = { }
DE_STOP_WORDS = { }
EN_STOP_WORDS = { }
2017-08-29 15:01:17 +02:00
2017-11-06 12:54:59 +01:00
2017-10-10 14:42:09 +02:00
############# filter tokens
2017-08-31 14:54:01 +02:00
2017-11-06 12:54:59 +01:00
def filterTokens ( tokens , funclist ) :
# in:tokenlist, funclist
# out: tokenlist
for f in funclist :
tokens = list ( filter ( f , tokens ) )
2017-11-27 12:49:05 +01:00
for tok in tokens :
if tok . pos_ == " NOUN " :
x = 0
2017-11-06 12:54:59 +01:00
return tokens
2017-10-10 14:42:09 +02:00
def keepPOS ( pos_list ) :
return lambda tok : tok . pos_ in pos_list
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
def keepNouns ( noun_list = NOUNS ) :
2017-11-27 12:49:05 +01:00
#return lambda tok: tok.lower_ in noun_list
return lambda tok : tok . lower_ in noun_list or tok . pos_ == " NOUN "
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
def removePOS ( pos_list ) :
return lambda tok : tok . pos_ not in pos_list
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
def removeWords ( words , keep = None ) :
2017-09-11 12:12:28 +02:00
if hasattr ( keep , ' __iter__ ' ) :
for k in keep :
try :
words . remove ( k )
except ValueError :
pass
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
return lambda tok : tok . lower_ not in words
def keepENT ( ent_list ) :
return lambda tok : tok . ent_type_ in ent_list
def removeENT ( ent_list ) :
return lambda tok : tok . ent_type_ not in ent_list
def remove_words_containing_Numbers ( ) :
return lambda tok : not bool ( re . search ( ' \ d ' , tok . lower_ ) )
def remove_words_containing_topLVL ( ) :
2017-10-16 14:01:38 +02:00
return lambda tok : not bool ( re . search ( REGEX_TOPLVL , tok . lower_ ) )
2017-10-10 14:42:09 +02:00
def remove_words_containing_specialCharacters ( ) :
2017-10-16 14:01:38 +02:00
return lambda tok : not bool ( re . search ( REGEX_SPECIALCHAR , tok . lower_ ) )
2017-10-10 14:42:09 +02:00
def remove_long_words ( ) :
return lambda tok : not len ( tok . lower_ ) < 2
def remove_short_words ( ) :
return lambda tok : not len ( tok . lower_ ) > 35
def remove_first_names ( ) :
return lambda tok : tok . lower_ not in [ name . lower ( ) for name in VORNAMEN ]
############# strings
2017-12-19 17:12:35 +01:00
"""
def remove_addresses(string):
    pass  # todo remove_addresses: idea: parse with postal.parser and add the address to the metadata


def lemmatizeWord(word, lemma_dict, n=5):
    for i in range(n):
        try:
            word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
        except:
            print(word)
    return word


def getFirstSynonym(word, thesaurus, n=3):
    for i in range(n):
        try:
            if word in thesaurus.keys():
                return thesaurus[word]

            elif word.title() in thesaurus.keys():
                return thesaurus[word.title()]

            elif word.lower() in thesaurus.keys():
                return thesaurus[word.lower()]
            else:
                return word
        except:
            print("THESAURUS ERROR FOR: {}".format(word))

    return word
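
# Usage sketch (hypothetical dictionaries): lemmatizeWord() is a plain dictionary lookup
# (applied up to n times to follow chained entries), getFirstSynonym() returns the
# thesaurus entry for a word if one exists, otherwise the word itself:
#
#   lemma_dict = {"druckers": "drucker", "druckern": "drucker"}
#   thesaurus = {"drucker": "printer"}
#   lemmatizeWord("Druckers", lemma_dict)    # -> 'drucker'
#   getFirstSynonym("drucker", thesaurus)    # -> 'printer'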
"""

############# stringcleaning

def processContentstream(textstream, parser, token_filterlist=None):
    # pre-parse
    textstream = preparse(textstream)

    pipe = parser.pipe(textstream)

    for doc in pipe:
        tokens = [tok for tok in doc]

        # in-parse
        if token_filterlist is not None:
            tokens = filterTokens(tokens, token_filterlist)

        # post-parse
        #todo STELLSCHRAUBE (tuning knob): tokens = [postparse(tok) for tok in tokens]  #todo: loses pos/tag information!
        tokens = [tok.lower_ for tok in tokens]

        yield " ".join(tokens)


def preparse(stringstream):
    for string in stringstream:
        # cut_after: drop everything after the closing salutation
        words = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]

        for gr in words:
            if gr in string:
                string = string.rpartition(gr)[0]
                break

        yield string


def postparse(toktext):
    toktext = toktext.lower_

    # remove_words_containing_topLVL
    toktext = toktext if not re.search(REGEX_TOPLVL, toktext) else ""

    # lemmatize
    toktext = lemmatizeWord(toktext)

    # normalize synonyms
    toktext = getFirstSynonym(toktext)

    # autocorrect
    toktext = autocorrectWord(toktext)

    return toktext


def processDictstream(dictstream, funcdict, parser):
    for dic in dictstream:
        result = {}

        for key, value in dic.items():
            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = filterTokens(tokens, funclist)

                result[key] = " ".join([tok.lower_ for tok in tokens])
            else:
                result[key] = value

        yield result
"""
##################################################################################################


"""
def extract_from_corpus(corpus):
"""
"""
    Extract from each doc of a corpus a string containing the token texts.

    :param corpus: textacy.Corpus
    :return: generator of strings
"""
"""
weighting = 'tf'  # 'tfidf'
ngrams = 1
min_df = 1
max_df = 0.9

###### vectorize corpus

vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.id_to_term  #__getattribute__("id_to_term")

# build WHITELIST. Contains at least the potential topics
WHITELIST = ["boss", "sap", "firefox"]  #todo auto-generate relevant technical terms
# kb_cats / kb_keys / kb_subjs: same label lists as defined in preprocessCorpus() below
WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs
THESAURUS = load_obj(path2thesaurus_dict)
#WORDS = load_obj(path2wordsdict)
LEMMAS = load_obj(path2lemmadict)
DE_STOP_WORDS = load_obj(path2DEstopwordlist)
#EN_STOP_WORDS = load_obj(path2ENstopwordlist)
VORNAMEN = load_obj(path2firstnameslist)

ents_boss = []
ents_sap = []

for doc in corpus:
    result = []

    #if doc.metadata["TicketNumber"] == "INC40506":
    #    breakpoint()

    for tok in doc:
"""
#if tok.lower_ =="boss":
# ents_boss.append(tok.ent_type_)
2017-12-08 11:06:07 +01:00
2017-12-19 17:12:35 +01:00
#if tok.lower_ =="sap":
# ents_sap.append(tok.ent_type_)
"""
        # if in the whitelist, keep the token as-is
        if tok.lower_ in WHITELIST:
            result.append(tok.lower_)

        # ignore header, urls, emails, stopwords, first names
        lemmatized_word = lemmatizeWord(tok.text, lemma_dict=LEMMAS)
        if lemmatized_word.lower() in ["sehr", "geehrt", "herr", "herrn", "herren", "dame", "damen", "liebe", "lieben", "hallo", "guten", "tag", "ehre", "hi"] \
                or tok.like_url \
                or tok.like_email \
                or tok.is_stop \
                or tok.is_punct \
                or tok.lower_ in DE_STOP_WORDS \
                or tok.lower_ in VORNAMEN:
            continue

        # cut after footer
        if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]:  # bug: for e.g. INC40506 this cuts away most of the text
            break

        if tok.pos_ in ["NOUN", "PROPN"] \
                or tok.ent_type_ in ["NORP", "FACILITY", "ORG", "PRODUCT", "WORK_OF_ART", "LOC"]:
            #or tok.dep_ == "ROOT":
            # or tok.lower_ in NOUNS \  #,"PERSON"] \
            toktext = tok.lower_

            toktext = lemmatized_word

            use_thesaurus = False
            if use_thesaurus:
                # idea: build the main synonym and count it, but feed the original words into the llda algorithm
                # bug: produces nonsense; question: is the total number of terms significantly reduced?
                first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
                if first_synonym is not None and first_synonym != '':
                    toktext = first_synonym if len(first_synonym.split()) == 1 else toktext

            result.append(toktext)

    yield " ".join(result)

#return doc_term_matrix
"""

#print(list(set(ents_sap)))
# ['', 'ORG', 'PERSON', 'LOC']

#print(list(set(ents_boss)))
# ['', 'ORG', 'PERSON', 'LOC']


# LOAD FROM CONFIG
path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")
path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")
path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")

custom_words = get_list_from_config("preprocessing", "custom_words")

corpus_de_path = FILEPATH + config.get("de_corpus", "path")
de_plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt"
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def norma(token_or_span):
    if isinstance(token_or_span, SpacyToken):
        return normalize_token(token_or_span)
    elif isinstance(token_or_span, SpacySpan):
        result = ' '.join(normalize_token(subtok) for subtok in token_or_span)
        return textacy.preprocess.normalize_whitespace(result)
    else:
        msg = 'Input must be a spacy Token or Span, not {}.'.format(type(token_or_span))
        raise TypeError(msg)


def normalize_token(tok):
    global CURRENT_TICKET, FOOTER_FLAG

    # check if CURRENT_TICKET is assigned
    try:
        CURRENT_TICKET
        FOOTER_FLAG
    except NameError:
        CURRENT_TICKET = tok.doc
        FOOTER_FLAG = False

    # remember the doc currently being processed; on a new ticket, reset the footer flag
    if tok.doc != CURRENT_TICKET:
        FOOTER_FLAG = False
        CURRENT_TICKET = tok.doc

    # if in the whitelist, keep the token as-is
    if tok.lower_ in WHITELIST:
        return tok.lower_

    # ignore header, urls, emails, stopwords, first names, blacklisted words
    lemmatized_word = lemmatizeWord(tok.text, lemma_dict=LEMMAS)
    if lemmatized_word.lower() in ["sehr", "geehrt", "herr", "herrn", "herren", "dame", "damen", "liebe", "lieben", "hallo", "guten", "tag", "ehre", "hi"] \
            or tok.like_url \
            or tok.like_email \
            or tok.is_stop \
            or tok.is_punct \
            or tok.lower_ in DE_STOP_WORDS \
            or tok.lower_ in VORNAMEN \
            or tok.lower_ in BLACKLIST:
        return ""  #todo all of this happens between to_terms_list and fit_transform

    # cut after footer
    if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]:  # bug: for e.g. INC40506 this cuts away most of the text
        FOOTER_FLAG = True

    if (tok.pos_ in ["NOUN", "PROPN"] or tok.ent_type_ in ["NORP", "FACILITY", "ORG", "PRODUCT", "WORK_OF_ART", "LOC"]) and not FOOTER_FLAG:
        #or tok.dep_ == "ROOT":
        # or tok.lower_ in NOUNS \  #,"PERSON"] \
        toktext = tok.lower_

        toktext = lemmatized_word

        use_thesaurus = False
        if use_thesaurus:
            # idea: build the main synonym and count it, but feed the original words into the llda algorithm
            # bug: produces nonsense; question: is the total number of terms significantly reduced?
            first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
            if first_synonym is not None and first_synonym != '':
                toktext = first_synonym if len(first_synonym.split()) == 1 else toktext

        return toktext
    else:
        return ""
def preprocessCorpus(cleaned_corpus):
    logprint("Preprocess {0}_corpus at {1}".format(cleaned_corpus.lang, datetime.now()))

    global THESAURUS, WORDS, LEMMAS, NOUNS, VORNAMEN, DE_STOP_WORDS, EN_STOP_WORDS, WHITELIST, BLACKLIST

    weighting = 'tf'  # 'tfidf'
    ngrams = (1, 2)
    min_df = 1
    max_df = 0.3

    min_label_freq = 1

    WHITELIST = ["boss", "sap", "firefox"]  # todo auto-generate the whitelist
kb_cats = [ ' eldorado ' , ' cws_confluence ' , ' wsus ' , ' mail groupware ' , ' d.3 dms ' , ' serviceportal ' , ' softwarelizenzen ' ,
' sophos ' , ' webserver ' , ' sap ' , ' ftp server ' , ' dhcp ' , ' tonerboerse ' , ' mailalias ' , ' arbeitsplatzsupport ' ,
' mediendienste ' , ' mailverteiler ' , ' uni mail ' , ' basis app ' , ' videoschnitt ' , ' DEFAULT ' , ' verwaltung ' ,
' matrix42_hilfe ' , ' hoersaaluebertragung ' , ' redmine ' , ' uniflow ' , ' keine rueckantwort ' , ' pools ' , ' leitung ' ,
' netze ' , ' konteneinsicht ' , ' kennwort aenderung ' , ' datanet ' , ' neuanschluss ' , ' semesterticket ' , ' asknet ' ,
' veranstaltungen ' , ' housing ' , ' fk 16 ' , ' fiona ' , ' betrieb ' , ' vorlagenerstellung ' ,
' studierendensekretariat ' , ' pvp ' , ' mobilfunkvertraege ' , ' ausleihe ' , ' web ' , ' spam phishing ' , ' sap urlaub ' ,
' evaexam ' , ' vorlesungsaufzeichnung ' , ' firewall betreuung ' , ' ub basis it ' , ' virtuelle desktops citrix ' ,
' fk15 ' , ' virtuelle server ' , ' lizenzserver ' , ' elektronisches telefonbuch ' , ' joomla itmc website ' ,
' weiterentwicklung ' , ' serversupport ' , ' wlan ' , ' kurse ' , ' technik ' , ' raumkalender ' , ' backup tsm ' ,
' haustechnik ' , ' voicemail box ' , ' facility ' , ' unicard ausgabe ' , ' mdm mobile device management ' ,
' entwicklung ' , ' webgestaltung ' , ' unicard sperrung ' , ' forensic ' , ' basis applikationen ' ,
' overhead projektor ' , ' plagiatserkennung ' , ' uniaccount zugangsdaten ' , ' zentrale webserver ' , ' webmailer ' ,
' fk12 webauftritt ' , ' plotter ' , ' campus management ' , ' ub_stoerungen ' , ' rundmail ' , ' telefon ' ,
' raumbuchung ' , ' fk12 migration ' , ' dienstreise ' , ' hardware ' , ' it sicherheit sic ' , ' hochleistungsrechnen ' ,
' unicard ' , ' sos ' , ' benutzerverwaltung_probleme ' , ' confluence ' , ' vpn ' , ' zhb ' , ' campus app ' ,
' itmc_aufgaben ' , ' sicherheit ' , ' schulungsraum verwaltung ' , ' unicard produktion ' , ' schulung ' , ' video ' ,
' dokoll support ' , ' sd ' , ' servicedesk ' , ' v2 campus app feedback ' , ' lido ' , ' app feedback ' ,
' ibz raumbuchung ' , ' hcm stammdaten ' , ' itmc_stoerungen ' , ' boss service desk ' , ' exchange nutzung ' ,
' office ' , ' rektorat -buero ' , ' bestellung ' , ' moodle ' , ' fk raumplanung 09 ' , ' aenderung ' , ' neuausstattung ' ,
' benutzerverwaltung ' , ' rechnerraeume ' , ' designentwicklung ' , ' fk 12 ' , ' werkstoffe lehrstuhl bauwesen ' ,
' server storage ' , ' beantragung ' , ' visitenkartenproduktion ' , ' gastaufenthalt ' , ' telefonkonferenzen ' ,
' raumbuchungssysteme ' , ' fk14_test ' , ' e mail dienste ' , ' grafik ' , ' ews ' , ' itmc schulungsraeume ' , ' tsm ' ,
' softwareverteilung ' , ' beamer ' , ' lizenzmanagement ' , ' fileserver einrichtung ' ,
' redmine projektverwaltung ' , ' service desk itmc ' , ' pruefungsmanagement ' ,
' prozess- und projektmanagement ' , ' formulare antraege ' , ' namensaenderung ' , ' verkauf ' , ' software ' ,
' itmc medienraeume ef50 ' , ' zugangsdaten ' , ' medientechnik ' , ' lan ' , ' veeam ' , ' unicard redaktionsteam ' ,
' changes ' , ' service portal ' , ' limesurvey ' , ' dns ' , ' dokoll pvp ' , ' uhren ' , ' nrw ticket ' , ' itmc_als ' ,
' linux bs ' , ' werkvertraege ' , ' blogs wikis foren ' , ' test ' , ' abmeldung ' , ' desktop & basisdienste ' ,
' telefonzentrale ' , ' siport zugangskontrolle ' , ' antrag auf rechnungserstellung ' , ' verschiedene aufgaben ' ,
' kundenserver ' , ' medienraeume ef50 ' , ' videokonferenzen ' , ' benutzungsverwaltung ' ,
' mailverteiler exchange ' , ' lsf ' , ' telefonabrechnung ' , ' werkstaette ' , ' uniaccount ' , ' outlook_einrichtung ' ,
' itmc webauftritt ' , ' zertifikate server dfn ' , ' allgemein ' , ' umzug ' , ' service portal redaktion ' , ' pos ' ,
' beschaffung ' , ' boss ' , ' hacker angriff ' , ' software entwicklung ' , ' cd dvd produktion ' , ' sam spider ' ,
' viren ' , ' kursplanung ' , ' itmc pools ' , ' kms ' , ' e learning ' ]
kb_keys = [ ' zugriff_onlinedienste_rueckmeldung ' , ' uniaccount ' , ' freischaltung ' , ' asknet ' , ' eduroam ' ,
' donnerstagsmail namensaenderung ' , ' asiexception ' , ' lsf ' , ' kundenantwort ' , ' chip ' , ' unitymedia ' ,
' citavi ' , ' fehler ' , ' windows beziehen ' , ' wlan ' , ' ipv6 ' , ' freischaltung verzoegert ' , ' betrag ' ,
' " defekte karte " ' , ' risse ' , ' laden ' , ' sap portal anderer modus ' , ' goeke ' ,
' informationen des itmc zum einsatz ' , ' transport wurde durchgefuehrt. ' , ' wi-fi ' , ' unicard_auszahlung ' ,
' ausleihe ' , ' unimail ' , ' uni-account ' , ' unicard ' , ' beantragung ' , ' nrw-ticket ' , ' printservice ' , ' dms ' ,
' ip6 ' , ' transport und beschreibung zum transportauftrag ! ' , ' wlan passwort ' ,
' dokumentenmanagementsystem ' , ' webmailer ' , ' vpn ' , ' repository ' , ' unicard ' , ' projekte ' , ' eingeschrieben ' ,
' unicard abholung oeffnungszeiten ' , ' d3 ' , ' beantragung ' , ' app tu-dortmund feedback ' , ' semester ticket ' ,
' redmine ' , ' git ' , ' geldkarte ' , ' outlook_exchange ' , ' spam standardmeldung phishing ' ,
' automatische aktualisierung der selbst angelegten kontakte in outlook ' , ' " beschaedigte unicard " ' ,
' elektronische telefonbuch ' , ' boss ' , ' wwrite ' , ' DEFAULT ' , ' anyconnect ' , ' wifi ' ]
kb_subjs = [ ' sd_office 365 plus support ' , ' citavi_lizenzschluessel_nicht bekommen ' , ' uni card ' ,
' sd_office 356 plus bestellung ' , ' sd_gastaufenthalter ' ,
' sd_outlook kontakte automatische aktualisierung ' , ' benutzer zum redmine hinzufuegen ' ,
' sd_matlab lizenzdatei pc-pools ' , ' sd_tu-app feedback standard ' , ' vpn_ipsec_stoerung ' ,
' vpn verbindung fuer unitymedia kunden ' , ' ub_prod_abholung_ abholfristen_benachrichtigungen ' ,
' einrichtung des eduroam netzwerks ' , ' sd_webmailer_threadanzeige und weiterleitung ' ,
' sd_wlan passwort setzen ' , ' ub_prod_namenskorrektur_student ' , ' sd_unimail imap_pop3 ' ,
' sd_outlook_in_exchange_einbinden ' , ' sd_keine rueckantwort kunde ' , ' sd_asknet_und_dreamspark ' ,
' sd_heirat_namensaenderung_student ' , ' bd_unicard_nicht_eingeschrieben ' , ' wlan ' ,
' sd_telefonbuch_prof_eintragung ' , ' change produktiv nehmen chn00146 - transport e01k909284 ' ,
' ungueltiges ticket siehe journal ' , ' apps_dms_d.3 client installation/login d.3 funktioniert nicht ' ,
' d.3 client installation ' , ' unicard_restbetrag_auszahlung ' , ' cm_asiexception ' , ' sd_origin_workaround ' ,
' sd_vpn_aktualisierung ' , ' problem mit der beantragung von der unicard ' ,
' sd_unicard fehlerhafte geldbuchung ' , ' sd_login tu portals english ' , ' sd_gmx_web.de ' ,
' studierendenausweis ' , ' sd_citavi ' , ' sd_fk9 test ' , ' sd_webmailer_thread-anzeige ' ,
' bd_unicard_geldkarte_laden ' , ' ub_unicard_unicard mit vollmacht abholen ' ,
' sd_stellenausschreibung schwarzes brett ' , ' freischaltung uniaccount ' ,
' sd_asknet_mitarbeiter_softwarebestellung ' , ' how to setup eduroam ' , ' sd_citavi bestellung ' ,
' unicard vergessen abzuholen und nicht mehr da ' , ' sd_unimail zu exchange ' , ' sd_diensthandy beschaffung ' ,
' sd_sap konteneinsicht antrag ' , ' sd_unicard_defekt ' , ' sd_webmailer einrichtung weiterleitung ' ,
' sd_kurs-angebote anmeldung ' , ' m42_dokumentationen_zu_neuen_ous ' , ' sd_sap_initialkennwort ' ,
' sd_sap_freischaltung ohne passwortaenderung ' , ' sd_telefonbuch-eintrag_aenderung ' , ' sd_pruefungsamt ' ,
' sd_phishing ' , ' apps_dms-passwort d.3 ' , ' sd_goeke drucker ' , ' sd_sap_dienstreise ' ,
' unicard nochmal beantragen ' , ' sd_outlook anmeldung gestoert ' , ' sd_citavi_support ' , ' DEFAULT ' ,
' sd_geraeteausleihe ' , ' sd_account_abmelden ' , ' sd_uniaccount freischaltung verzoegert englisch ' ,
' ub_beschaedigte unicard ' , ' sd_gleitzeitanlage_dez3_stoerung ' , ' transportdurchfuehung ' ,
' sd_sap_initialkennwort_englisch ' , ' sd_antwort_phishingmail ' , ' sd_namensaenderung mitarbeiter ' ,
' re: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss ' , ' lsf freischaltung als mitarbeiter/in ' ,
' ub_unicard_spaetere abholung moeglich? ' , ' sd_antrag funktionale mailadresse ' , ' sd_apple-on-campus ' ,
' sd_office365_asknet ' , ' sd_sophos download ' , ' sd_freischaltung uniaccount verzoegert ' ,
' ub_unicard_zusendung der karte moeglich? ' , ' ohne betreff ' , ' sd_immatrikulationsbescheinigung_portal ' ,
' sd_studisek_buchung_semesterbeitrag ' , ' sd_studisek_englisch ' ,
' probleme mit der namensaenderung/ neue unicard ' , ' sd_telefonbuch, neues system ' ,
' fehlender eintrag im elektronischen telefonbuch ' , ' sd_boss_notenverbuchung ' ,
' sd_laufzeit unimail account ' , ' sd_semesterticket ' , ' sd_kontakt_asknet ' , ' windows 10 ' ,
' sd_login_tu_portale ' , ' ub_geldchip-problem bei uc ' , ' sd_zugriff_onlinedienste_rueckmeldung ' ,
' sd_wlan-gastkonto ' , ' sd_tu_app_keine internetverbindung ' , ' sd_uniaccount_ehemalige_passwortaenderung ' ,
' sd_verlust/antrag unicard ' , ' sd_sap_konteneinsicht_ workaround ' , ' apps_redmine_repository ' ,
' sd_itmc kurse anmeldebestaetigung ' , ' sd_mail_als_anhang ' , ' bd_unicard_chip_defekt ' ,
' probleme mit unicard ' , ' ub_unicard_abholungszeiten ' , ' sd_falsche_personendaten ' ,
' sd_uniaccount_ehemalige_studierende ' , ' sd_vpn anleitungen ' , ' sd_kurs-angebote itmc ' , ' sd_studisek ' ,
' sd_login tu portale ' , ' sd_immatrikulationsbescheigung_druckfehler ' , ' ub_drucker kopierer ' ,
' sd_vpn_temporaerer fehler ub ' , ' sd_spss_online_bestellung ' , ' sd_dreamspark ' ,
' sd_unicard_gesperrte unicard entsperre ' , ' sd_boss-bescheinigung ' , ' bd_goeke_allgemein ' ,
' sd_uniaccount_passwortaenderung ' , ' sd_namensaenderung_englisch ' , ' sd_email_namensaenderung ' ,
' bd_unicard_freigabe_beantragung ' , ' spam ohne tu bezug ' , ' sd_internationaloffice ' ,
' sd_tu-app feedback_englisch ' , ' cm_lsf-boss_freischaltung ' , ' sd-e-mail_adresse_funktional_beantragen ' ,
' sd_vpn_webvpn ' , ' sd_vpn_probleme_mit_unitymedia ' , ' sd_plotauftrag_zv ' , ' sd_beantragung_unicard ' ,
' sd_antworten_korrekt ' , ' ub_prod_neue unicard bei beschaedigung ' ,
' sd_telefonantrag_aenderung_neuantrag ' , ' sd_wlan passwort englisch ' , ' sd_aktivierung uniaccount ' ,
' sd_spam e-mail bekannt meldung ' , ' sd_wlan_beratung ' , ' ub_namensaenderung ' ,
' sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt) ' , ' sd_unicard_abholung ' ,
' sd_uniaccount_dauer freischaltung ' , ' sd_uniaccount activation englisch ' , ' sd_unicard_max_laufzeit ' ,
' sd_unicard_workaround_bestellung ' , ' sd_sap_firefox_esr ' , ' sap portal " im anderen modus geoeffnet " ' ,
' sd_origin nur noch eine seriennummer ' , ' sd_login_unibib ub-it ' ]
    BLACKLIST = get_list_from_config("preprocessing", "custom_words")
    WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs

    DE_STOP_WORDS = load_obj(path2DEstopwordlist)
    VORNAMEN = load_obj(path2firstnameslist)
    LEMMAS = load_obj(path2lemmadict)
    THESAURUS = load_obj(path2thesaurus_dict)

    ###### vectorize corpus
    logprint("vectorize corpus")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)  # bug: norma() kicks tokens out, so min_df/max_df no longer refer to the cleaned corpus

    terms_list_gen = (doc.to_terms_list(ngrams=ngrams, as_strings=True, normalize=norma) for doc in cleaned_corpus)

    doc_term_matrix = vectorizer.fit_transform(terms_list_gen)
    id2term = vectorizer.id_to_term
    term2id = vectorizer.vocabulary
    logprint("corpus vectorized")

    # write labeled_lines.txt
    line_gen = gen_lines(doc_term_matrix, term2id, cleaned_corpus)

    lines_txt = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/labled_lines.txt"
    textacy.fileio.write_file_lines(line_gen, filepath=lines_txt)
    logprint("labled_lines.txt written")

    #### ticket2label_cat
    # get all categories
    """
    labelist_cat = []
    for doc in cleaned_corpus:
        category = normalize_str(doc.metadata["categoryName"])
        labelist_cat.append(category)
    """
    labelist_cat = [normalize_str(doc.metadata["categoryName"]) for doc in cleaned_corpus]

    # create label_dict
    labeldict_cat = create_labeldict(labelist_cat, min_label_freq=1, add_default_label=True)

    # map tickets to labels
    """
    ticket2label_cat = {}
    for doc in cleaned_corpus:
        ticketID = doc.metadata["TicketNumber"]
        category_name_ = doc.metadata["categoryName"]

        label_num = labeldict_cat.get(category_name_, labeldict_cat['DEFAULT'])

        ticket2label_cat.update({ticketID: label_num})
    """
    ticket2label_cat = {doc.metadata["TicketNumber"]: labeldict_cat.get(doc.metadata["categoryName"], labeldict_cat['DEFAULT']) for doc in cleaned_corpus}

    ##################################################################################################

    kb2keywords_dict, kb2subjects_dict, ticket2kbs_dict, ticket2keywords_dict, ticket2subjects_dict = create_ticket2sth_dicts()

    #### ticket2label_keys
    ticket2label_keys = create_ticket2label_dict(ticket2keywords_dict, cleaned_corpus)
    """
    labelist_keys = ticket2keywords_dict.values()
    labelist_keys = flatten(labelist_keys)

    labeldict_keys = create_labeldict(labelist_keys, min_label_freq=1, add_default_label=True)

    ticket2label_keys = {}
    for doc in cleaned_corpus:
        ticketID = doc.metadata["TicketNumber"]

        keywords = ticket2keywords_dict.get(ticketID, ['DEFAULT'])

        label = ""
        for kw in keywords:
            label = label + str(labeldict_keys.get(normalize_str(str(kw)), labeldict_keys['DEFAULT'])) + " "

        ticket2label_keys.update({ticketID: label})
    """

    ##################################################################################################

    #### ticket2label_subjs
    ticket2label_subjs = create_ticket2label_dict(ticket2subjects_dict, cleaned_corpus)
    """
    labelist_subjs = ticket2subjects_dict.values()
    labelist_subjs = flatten(labelist_subjs)

    labeldict_subjs = create_labeldict(labelist_subjs, min_label_freq=1, add_default_label=True)

    ticket2label_subjs = {}
    for doc in cleaned_corpus:
        ticketID = doc.metadata["TicketNumber"]

        keywords = ticket2subjects_dict.get(ticketID, ['DEFAULT'])

        label = ""
        for kw in keywords:
            label = label + str(labeldict_subjs.get(normalize_str(str(kw)), labeldict_subjs['DEFAULT'])) + " "

        ticket2label_subjs.update({ticketID: label})
    """

    #### ticket2label_kb
    ticket2label_kb = create_ticket2label_dict(ticket2kbs_dict, cleaned_corpus)

    """
    labelist_kbs = ticket2kbs_dict.values()
    labelist_kbs = flatten(labelist_kbs)

    labeldict_kbs = create_labeldict(labelist_kbs, min_label_freq=1, add_default_label=True)

    ticket2label_kb = {}
    for doc in cleaned_corpus:
        ticketID = doc.metadata["TicketNumber"]

        keywords = ticket2kbs_dict.get(ticketID, ['DEFAULT'])

        label = ""
        for kw in keywords:
            label = label + str(labeldict_kbs.get(normalize_str(str(kw)), labeldict_kbs['DEFAULT'])) + " "

        ticket2label_kb.update({ticketID: label})
    """
    def relabele_lines(file, ticket2label_dict):

        line_gen = textacy.fileio.read_file_lines(file)

        for line in line_gen:
            label = re.findall(r'\[(.*?)\]', line)

            new_label = "[ "
            for lbl in label:
                new_label = new_label + str(ticket2label_dict.get(str(lbl), "")).strip() + " "

            new_label = new_label + "] "
            result = new_label + str(line.rpartition("]")[2])

            # new_label = str([ticket2label_dict.get(str(lbl), "") for lbl in label])
            # result = "[ " + new_label + " ] " + line.rpartition("]")[2]
            # print(result)

            yield result

    lines_sub = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/labled_lines_sub.txt"
    generator = relabele_lines(lines_txt, ticket2subjects_dict)
    textacy.fileio.write_file_lines(generator, lines_sub)
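
    # Example (illustrative): relabele_lines() rewrites the leading "[...]" label of each
    # line in labled_lines.txt, replacing the original TicketNumber with whatever
    # ticket2label_dict maps it to, and keeps the rest of the line unchanged:
    #
    #   in :  "[INC44526] telefon umzug anschluss ..."
    #   out:  "[ <label(s) mapped to INC44526> ] telefon umzug anschluss ..."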
    labelset, corpus, labels = load_corp(lines_sub)

    K = 20  # number of topics; irrelevant here, not used by the implementation
    alpha = 0.001
    beta = 0.001
    number_of_iterations = 10

    llda = LLDA(K, alpha, beta)
    llda.set_corpus(labelset, corpus, labels)

    for i in range(number_of_iterations):
        llda.inference()

    phi = llda.phi()
    # print(llda.vocas)

    # for v, voca in enumerate(llda.vocas):
    #     print ','.join([voca] + [str(x) for x in llda.n_z_t[:, v]])
    #     print(','.join([voca] + [str(x) for x in phi[:, v]]))


    ################# termite plot ######################################
    topic_labels = list(labelset)
    term_labels = list(llda.vocas)
    term_topic_weights = phi.transpose()

    threshmin = 0.05
    thresholded = threshold(term_topic_weights, threshmin=threshmin)

    draw.draw_termite(thresholded, topic_labels, term_labels, save="test.png")

    #jgibbsLLDAv2("corpi/labled_lines.txt", ticket2kbs_dict, cleaned_corpus, "results")

    return doc_term_matrix, id2term
def create_ticket2label_dict(ticket2chunk_dict, corpus):
    """
    Creates a dictionary that maps a TicketNumber to a label.

    :param ticket2chunk_dict: e.g. {TicketNumber: KB_entries}
    :return: {TicketNumber: label}
    """
    labelist = ticket2chunk_dict.values()
    labelist = flatten(labelist)

    labeldict = create_labeldict(labelist, min_label_freq=1, add_default_label=True)

    ticket2label = {}
    for doc in corpus:
        ticketID = doc.metadata["TicketNumber"]

        keywords = ticket2chunk_dict.get(ticketID, ['DEFAULT'])

        label = ""
        for kw in keywords:
            label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "

        ticket2label.update({ticketID: label})

    return ticket2label
def create_labeldict(labelist, min_label_freq=1, add_default_label=True):
    # use only labels that occur at least min_label_freq times
    labelist = [l for l in labelist if labelist.count(l) >= min_label_freq]

    in_labelist_ = {k: labelist.count(k) for k in labelist}      # { label1 : 3 , label2 : 5 , label3 : 1 }
    labelist = sort_dictionary(in_labelist_)                     # [ (label3, 1), (label1, 3), (label2, 5) ]
    labelist.reverse()                                           # [ (label2, 5), (label1, 3), (label3, 1) ]
    labeldict = {elem[0]: i for i, elem in enumerate(labelist)}  # { label2 : 0, label1 : 1 , label3 : 2 }

    if add_default_label:
        if 'DEFAULT' not in labeldict.keys():
            labeldict.update({'DEFAULT': len(labelist)})         # { label2 : 0, label1 : 1 , label3 : 2 , DEFAULT : 3 }

    return labeldict
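
# Worked example (assuming sort_dictionary() sorts by count, ascending): labels are
# numbered by descending frequency, and an extra DEFAULT label is appended at the end:
#
#   create_labeldict(["wlan", "vpn", "wlan", "unicard", "wlan", "vpn"])
#   # -> {'wlan': 0, 'vpn': 1, 'unicard': 2, 'DEFAULT': 3}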
def create_ticket2sth_dicts():
    """
    Return: {str: [str]}

    kb2keywords_dict        {'KBA10230': ['DEFAULT'], 'KBA10129': ['DEFAULT'], 'KBA10287': ['sd_ansys_informationen'], ...}  len = 260
    kb2subjects_dict        {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], ...}  len = 260

    ticket2kbs_dict         {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], ...}  len = 4832
    ticket2keywords_dict    {'INC44526': ['DEFAULT'], 'INC67205': ['DEFAULT'], 'INC71863': ['DEFAULT'], 'INC44392': ['asknet'], ...}  len = 4832
    ticket2subjects_dict    {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], ...}  len = 4832
    """
    # kb2keywords_dict / kb2subjects_dict --> {str : [str]}
    kb2keywords_dict = {}
    kb2subjects_dict = {}

    kb_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
    next(kb_gen, None)  # skip first line "ArticleID";"Subject";"Keywords";...

    for line in kb_gen:
        kb_id = line[0]
        subject = normalize_str(line[1])
        keywords = [normalize_str(x) for x in str(line[2]).split(",")]

        if kb_id not in kb2keywords_dict.keys():
            kb2keywords_dict[kb_id] = keywords if keywords != [''] else ["DEFAULT"]
        else:
            kb2keywords_dict[kb_id] = kb2keywords_dict[kb_id] + keywords

        if kb_id not in kb2subjects_dict.keys():
            kb2subjects_dict[kb_id] = [normalize_str(subject) if subject != [''] else "DEFAULT"]
        else:
            kb2subjects_dict[kb_id].append(normalize_str(subject))

    # ticket2kbs_dict --> {str : [str]}
    ticket2kbs_dict = {}

    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
    next(kb2ticket_gen, None)  # skip first line "TicketNumber";"ArticleID"

    for line in kb2ticket_gen:
        ticket_id = line[0]
        kb_id = line[1]

        if ticket_id not in ticket2kbs_dict.keys():
            ticket2kbs_dict[ticket_id] = [kb_id]
        else:
            ticket2kbs_dict[ticket_id].append(kb_id)

    # ticket2keywords --> {str : [str]}
    ticket2keywords_dict = {}

    for ticket_id, kb_ids in ticket2kbs_dict.items():
        if ticket_id not in ticket2keywords_dict.keys():
            ticket2keywords_dict[ticket_id] = []

        for kb_id in kb_ids:
            ticket2keywords_dict[ticket_id].append(kb2keywords_dict[kb_id])

        ticket2keywords_dict[ticket_id] = flatten(ticket2keywords_dict[ticket_id])

    # ticket2subjects --> {str : [str]}
    ticket2subjects_dict = {}

    for ticket_id, kb_ids in ticket2kbs_dict.items():
        if ticket_id not in ticket2subjects_dict.keys():
            ticket2subjects_dict[ticket_id] = []

        for kb_id in kb_ids:
            ticket2subjects_dict[ticket_id].append(kb2subjects_dict[kb_id])

        ticket2subjects_dict[ticket_id] = flatten(ticket2subjects_dict[ticket_id])

    """
    count_dict = {}
    for v in ticket2kbs_dict.values():
        for kb in v:
            if kb in count_dict.keys():
                count_dict[kb] += 1
            else:
                count_dict[kb] = 1

    sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))

    for k, v in sorted_dict:
        subs = kb2subjects_dict[k]
        keys = kb2keywords_dict[k]
        print(subs, keys, v)  # question: how many tickets per topic?

    print("kb_entrys used: {}".format(len(sorted_dict)))  # question: how many kb entries are used in total?: 155
    """

    return kb2keywords_dict, kb2subjects_dict, ticket2kbs_dict, ticket2keywords_dict, ticket2subjects_dict


#labelist = ticket2keywords_dict.values()
#labelist = flatten(labelist)
#labelist = list(set(labelist))
#labeldict = {k: v for v, k in enumerate(labelist)}
##############################################################################################

def gen_lines(doc_term_matrix, term2id, corpus, label="TicketNumber"):
    for i, doc in enumerate(corpus):
        line = "[" + doc.metadata[label] + "]"

        for term, id_ in term2id.items():
            if doc_term_matrix[i, id_] != 0:
                term = term if len(term.split()) == 1 else "_".join(term.split())
                line = line + " " + term

        if len(line) != 0:
            yield line
        else:
            continue
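
# Example output line of gen_lines() (illustrative ticket): the metadata label in square
# brackets, followed by every term of the document with a non-zero entry in the
# doc-term matrix; multi-word terms are joined with underscores:
#
#   [INC44526] telefon umzug anschluss service_desk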
def main(cleaned_corpus):
    start = time.time()

    """
    filter_tokens = [
        keepNouns(NOUNS),

        removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),

        removePOS(["PUNCT", "SPACE", "NUM"]),

        #todo STELLSCHRAUBE (tuning knob): remove_words_containing_Numbers(),
        #todo STELLSCHRAUBE (tuning knob): remove_long_words(),
        #todo STELLSCHRAUBE (tuning knob): remove_short_words()
    ]

    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],
        "Subject": [removePOS(["SPACE", "PUNCT"])],
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }
    """

    doc_term_matrix, id2term_dict = preprocessCorpus(cleaned_corpus)

    end = time.time()
    logprint("Time Elapsed Preprocessing: {0} min".format((end - start) / 60))

    return doc_term_matrix, id2term_dict


if __name__ == "__main__":
    logprint("Load Corpus...")
    corpus_name = "de_clean_small"  # _small
    cleaned_corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/", corpus_name=corpus_name)
    logprint("... Done")

    main(cleaned_corpus)
"""
pipe = [
2017-09-11 13:00:03 +02:00
2017-10-10 14:42:09 +02:00
##String
2017-09-11 13:00:03 +02:00
2017-10-10 14:42:09 +02:00
fixUnicode ( ) ,
replaceHardS ( ) ,
resolveAbbrivations ( ) ,
2017-09-11 13:00:03 +02:00
2017-10-10 14:42:09 +02:00
remove_words_containing_topLVL ( ) ,
2017-09-11 13:00:03 +02:00
2017-10-10 14:42:09 +02:00
replaceSpecialChars ( " " ) , ( mit Leerzeichen erstzen , dadruch werden Terme wie 8203 ; verfügung getrennt
2017-09-11 13:00:03 +02:00
2017-10-10 14:42:09 +02:00
remove_words_containing_Numbers ( ) ,
2017-08-31 14:54:01 +02:00
2017-10-10 14:42:09 +02:00
##spacyParse
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
removeENT ( " PERSON " ) ,
keepPOS ( [ " NOUN " ] ) ,
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
#ODER
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
lemmatize ( ) ,
removeWords ( de_stop_words + config . get ( " preprocessing " , " custom_words " ) . split ( " , " ) ) ,
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
# evtl.
spellCorrection ( ) ,
keepUniqeTokens ( ) ,
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
]
2017-08-29 15:01:17 +02:00
2017-10-10 14:42:09 +02:00
"""
"""
filter_tokens = [
    #removeENT(["PERSON"]),
    #idea: remove addresses  # so far via cut_after("gruss") --> postal.parser
    #idea: spelling correction --> PyEnchant
    #idea: thesaurus --> WordNet, or a custom one

    remove_words_containing_Numbers(),

    removePOS(["PUNCT", "SPACE", "NUM"]),

    removeWords(de_stop_words + custom_words),

    remove_long_words(),
    remove_short_words(),
    remove_first_names(),

    keepPOS(["NOUN"]),

]
"""