2017-09-11 17:29:54 +02:00
# -*- coding: utf-8 -*-
2017-11-17 11:46:57 +01:00
import matplotlib
# Choose the matplotlib backend up front, before the remaining imports below run.
matplotlib . use ( ' Agg ' )
import time
2017-09-15 14:32:44 +02:00
2017-11-17 11:46:57 +01:00
import textacy
2017-09-15 14:32:44 +02:00
2017-11-17 11:46:57 +01:00
import numpy as np
# Reference timestamp; the elapsed-time report further down subtracts it from a later reading.
start = time . time ( )
import json
import os . path
import subprocess
from textacy import Vectorizer , viz
2017-09-15 14:32:44 +02:00
2017-11-17 11:46:57 +01:00
from miscellaneous import *
import textacy
from scipy import *
import os
2017-11-06 12:54:59 +01:00
import json
2017-11-17 11:46:57 +01:00
# Directory of this script (via os.path.realpath, so symlinks are resolved) with a literal
# suffix appended; used as the prefix when the corpus path is built below.
FILEPATH = os . path . dirname ( os . path . realpath ( __file__ ) ) + " / "
import draw
2017-09-13 12:53:09 +02:00
2017-09-14 11:40:00 +02:00
2017-11-21 10:14:37 +01:00
# load corpus
# NOTE(review): `config`, `load_corpus` and `logprint` are not defined in this file;
# presumably they are provided by `from miscellaneous import *` -- verify.
# The corpus directory is read from the config (section/option given by the two string
# arguments) and resolved relative to FILEPATH.
corpus_de_path = FILEPATH + config . get ( " de_corpus " , " path " )
# Name of the stored corpus to load, assembled from a language part and a stage suffix.
preCorpus_name = " de " + " _pre_ticket "
# load_corpus returns a pair; the second element is bound to `parser`, which the live
# code visible here does not use afterwards.
corpus , parser = load_corpus ( corpus_name = preCorpus_name , corpus_path = corpus_de_path )
logprint ( " Corpus loaded: {0} " . format ( corpus . lang ) )
2017-11-27 12:49:05 +01:00
2017-11-21 10:14:37 +01:00
# TODO: randomize (presumably: shuffle the documents before the train/test split below)
2017-09-14 11:40:00 +02:00
2017-11-27 12:49:05 +01:00
# Experiment parameters for the train/test split and the topic model set up below.
split = 0.8  # fraction of the corpus that goes into the training slice
weighting = " tf "  # passed to Vectorizer as its `weighting` argument
min_df = 0
# NOTE(review): this is an int; textacy's Vectorizer documents int df thresholds as absolute
# document counts and float ones as fractions -- confirm that 1 (rather than 1.0) is intended.
max_df = 1
ngrams = 1  # n-gram size requested from Doc.to_terms_list
n_topics = 3  # number of topics given to the TopicModel
top_n = 7  # not referenced by the live code visible in this part of the file
# Index of the first document that belongs to the test slice (fraction truncated toward zero).
split_index = int ( float ( len ( corpus ) ) * split )
2017-11-21 10:14:37 +01:00
# Split the corpus at split_index: everything before it is training data,
# everything from it onward is test data.
corpus_train = corpus[:split_index]
# Slice to the end of the corpus. An explicit upper bound of len(corpus) - 1
# would exclude the final document (slice ends are exclusive) and would leave
# the test slice empty whenever only one document lies past split_index.
corpus_test = corpus[split_index:]
2017-11-17 11:46:57 +01:00
2017-11-27 12:49:05 +01:00
###### Initialize and train a topic model
# The vectorizer is configured from the module-level weighting / min_df / max_df values.
vectorizer = Vectorizer ( weighting = weighting , min_df = min_df , max_df = max_df )
# Generator (evaluated lazily) producing one terms list per training document:
# n-gram size from `ngrams`, named entities excluded, terms returned as strings.
terms_list = ( doc . to_terms_list ( ngrams = ngrams , named_entities = False , as_strings = True ) for doc in corpus_train )
# Fit the vectorizer on the training terms and build the document-term matrix in one pass.
doc_term_matrix = vectorizer . fit_transform ( terms_list )
# Attribute looked up by name on the fitted vectorizer; not used by the live code visible here.
id2term = vectorizer . __getattribute__ ( " id_to_term " )
model = textacy . tm . TopicModel ( " lda " , n_topics = n_topics )
model . fit ( doc_term_matrix )
######
# `components_` of the wrapped estimator (model.model). The variable name is spelled
# `compenents` (sic); it is a module-level name, so the spelling is left unchanged.
compenents = model . model . components_
"""
components_ : array , [ n_components , n_features ]
Variational parameters for topic word distribution .
Since the complete conditional for topic word distribution is a Dirichlet ,
components_ [ i , j ] can be viewed as pseudocount that represents
the number of times word j was assigned to topic i .
It can also be viewed as distribution over the words for each topic after normalization :
model . components_ / model . components_ . sum ( axis = 1 ) [ : , np . newaxis ] .
"""
# First document of the test slice; this indexing fails if that slice is empty.
test_doc = corpus_test [ 0 ]
end = time . time ( )
# Report the seconds elapsed between the `start` reading and `end`.
print ( " \n \n \n Time Elapsed Test: {0} \n \n " . format ( end - start ) )
"""
2017-11-21 10:14:37 +01:00
# frage wieviele tickets pro topic?
2017-11-17 11:46:57 +01:00
2017-11-27 12:49:05 +01:00
ticket_gen = textacy . fileio . read_csv ( FILEPATH + " M42-Export/de_tickets.csv " , delimiter = " ; " )
cat_dict = { }
cat2id_dict = { }
for line in ticket_gen :
tick_id = line [ 0 ]
cat = normalize ( line [ 3 ] )
cat2id_dict [ cat ] = tick_id
if cat not in cat_dict . keys ( ) :
cat_dict [ cat ] = 1
else :
cat_dict [ cat ] + = 1
import operator
sorted_dict = sorted ( cat_dict . items ( ) , key = operator . itemgetter ( 1 ) )
for k , v in sorted_dict :
if k == " sd " :
print ( cat2id_dict [ k ] )
print ( k , v )
print ( len ( sorted_dict ) )
2017-11-21 10:14:37 +01:00
kb2ticket_gen = textacy . fileio . read_csv ( FILEPATH + " M42-Export/KB2Ticket_2017-09-13.csv " , delimiter = " ; " )
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
ticket2kb_dict = { }
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
for line in kb2ticket_gen :
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
ticket_id = line [ 0 ]
kb_id = line [ 1 ]
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
ticket2kb_dict [ ticket_id ] = kb_id
# {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...} # kb2keywords_dict
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
kb2keywords_gen = textacy . fileio . read_csv ( FILEPATH + " M42-Export/KB_2017-09-13.csv " , delimiter = " ; " )
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
next ( kb2keywords_gen , None ) # skip first line("ArticleID";"Subject";"Keywords";...)
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
kb2keywords_dict = { }
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
kb_keywords = False
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
for line in kb2keywords_gen :
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
kb_id = line [ 0 ]
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
subject = line [ 1 ]
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
keywords = line [ 2 ]
keywords_list = [ normalize ( x ) for x in str ( keywords ) . split ( " , " ) ]
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
if kb_id not in kb2keywords_dict . keys ( ) :
kb2keywords_dict [ kb_id ] = [ ]
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
if kb_keywords :
for item in keywords_list :
if item != " " :
kb2keywords_dict [ kb_id ] . append ( item )
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
else :
kb2keywords_dict [ kb_id ] . append ( subject )
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
# remove all empty items
kb2keywords_dict = { k : v for k , v in kb2keywords_dict . items ( ) if len ( v ) != 0 }
# {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}
cat_dict = { }
count_dict = { }
keywords_dict = { }
for doc in corpus :
category_name_ = doc . metadata [ " categoryName " ]
if category_name_ not in cat_dict . keys ( ) :
cat_dict [ category_name_ ] = 1
else :
cat_dict [ category_name_ ] + = 1
try :
x = doc . metadata [ " TicketNumber " ]
x = ticket2kb_dict [ x ]
x = kb2keywords_dict [ x ]
except :
pass
for k , v in kb2keywords_dict . items ( ) : #str,list
for elem in v :
if elem not in count_dict . keys ( ) :
count_dict [ elem ] = 1
else :
count_dict [ elem ] + = 1
import operator
kb2keywords_gen = textacy . fileio . read_csv ( FILEPATH + " M42-Export/KB_2017-09-13.csv " , delimiter = " ; " )
next ( kb2keywords_gen , None ) # skip first
cnt = 0
for kb in kb2keywords_gen :
cnt + = 1
print ( str ( cnt ) )
count_dict = { }
# "ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn"
for kb_entry in kb2keywords_gen :
entry_ = kb_entry [ 1 ]
if entry_ not in count_dict . keys ( ) :
count_dict [ entry_ ] = 1
else :
count_dict [ entry_ ] + = 1
2017-11-27 12:49:05 +01:00
import operator
2017-11-21 10:14:37 +01:00
sorted_dict = sorted ( count_dict . items ( ) , key = operator . itemgetter ( 1 ) )
#for k,v in sorted_dict:
# print(k,v)
#print(len(sorted_dict))
2017-11-27 12:49:05 +01:00
"""
2017-11-17 11:46:57 +01:00
2017-11-21 10:14:37 +01:00
"""
# kb2keywords_dict
kb2keywords_gen = textacy . fileio . read_csv ( FILEPATH + " M42-Export/KB2Ticket_2017-09-13.csv " ,
delimiter = " ; " )
used_kb = [ ]
for kb in kb2keywords_gen :
used_kb . append ( kb [ 1 ] )
print ( " used_kb: {} " . format ( len ( list ( set ( used_kb ) ) ) ) )
#"ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn"
kb2keywords_gen = textacy . fileio . read_csv ( FILEPATH + " M42-Export/KB_2017-09-13.csv " , #
delimiter = " ; " )
next ( kb2keywords_gen , None ) # skip first
cat_lst = [ ]
sub_lst = [ ]
key_lst = [ ]
for kb in kb2keywords_gen :
cat_lst . append ( kb [ 0 ] )
sub_lst . append ( kb [ 1 ] )
key_lst . append ( kb [ 2 ] . split ( " , " ) )
cats_setlist = list ( set ( cat_lst ) )
print ( " cats: {} " . format ( len ( cats_setlist ) ) )
print ( cats_setlist [ 0 : 20 ] )
print ( )
print ( " sub_lst: {} " . format ( len ( sub_lst ) ) )
sub_setlist = list ( set ( sub_lst ) ) #frage: hat wirklich jeder kb_eintrag ein anderesn Betreff?
print ( " sub_setlist: {} " . format ( len ( sub_setlist ) ) )
#print(sub_setlist[0:20])
print ( )
key_lst = [ item for sublist in key_lst for item in sublist ] #flatten list
key_setlist = list ( set ( key_lst ) )
print ( " key_setlist: {} " . format ( len ( key_setlist ) ) )
#print(key_setlist[0:20])
print ( " \n \n \n \n " )
"""
"""
used_list = [ ' bd_unicard_nicht_eingeschrieben ' , ' sd_vpn_temporaerer fehler ub ' , ' sd_webmailer_threadanzeige und weiterleitung ' , ' ub_beschaedigte unicard ' , ' sd_boss_notenverbuchung ' , ' d.3 client installation ' , ' sd_keine rueckantwort kunde ' , ' sd_asknet_und_dreamspark ' , ' sd_beantragung_unicard ' , ' sd_gastaufenthalter ' , ' sd_internationaloffice ' , ' sd_outlook anmeldung gestoert ' , ' unicard_restbetrag_auszahlung ' , ' apps_dms_d.3 client installation/login d.3 funktioniert nicht ' , ' ub_unicard_unicard mit vollmacht abholen ' , ' sd_namensaenderung mitarbeiter ' , ' sd_itmc kurse anmeldebestaetigung ' , ' sd_zugriff_onlinedienste_rueckmeldung ' , ' benutzer zum redmine hinzufuegen ' , ' sd_unicard_gesperrte unicard entsperre ' , ' lsf freischaltung als mitarbeiter/in ' , ' sd_mail_als_anhang ' , ' sd-e-mail_adresse_funktional_beantragen ' , ' sd_goeke drucker ' , ' sd_unimail imap_pop3 ' , ' sd_origin_workaround ' , ' sd_matlab lizenzdatei pc-pools ' , ' sd_outlook kontakte automatische aktualisierung ' , ' sd_sap konteneinsicht antrag ' , ' ohne betreff ' , ' sd_telefonantrag_änderung_neuantrag ' , ' sd_sophos download ' , ' sd_geraeteausleihe ' , ' studierendenausweis ' , ' sd_citavi ' , ' sd_laufzeit unimail account ' , ' sd_login_unibib ub-it ' , ' sd_tu_app_keine internetverbindung ' , ' sd_unicard_max_laufzeit ' , ' ub_unicard_zusendung der karte moeglich? ' , ' sd_telefonbuch-eintrag_änderung ' , ' ub_drucker kopierer ' , ' windows 10 ' , ' sd_telefon (antrag: neuanschluss, umzug, änderung erledigt) ' , ' sd_tu-app feedback standard ' , ' sd_spam e-mail bekannt meldung ' , ' sd_spss_online_bestellung ' , ' sd_apple-on-campus ' , ' sd_studisek ' , ' sd_office 365 plus support ' , ' sd_sap_initialkennwort_englisch ' , ' sd_office365_asknet ' , ' re: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss ' , ' sd_login tu portale ' , ' ungueltiges ticket siehe journal ' , ' sd_sap_freischaltung ohne passwortaenderung ' , ' bd_unicard_geldkarte_laden ' , ' sd_verlust/antrag unicard ' , ' sd_unimail zu exchange ' , ' citavi_lizenzschluessel_nicht bekommen ' , ' sd_plotauftrag_zv ' , ' sd_citavi_support ' , ' sd_antworten_korrekt ' , ' sd_wlan-gastkonto ' , ' sd_antwort_phishingmail ' , ' bd_unicard_freigabe_beantragung ' , ' sd_origin nur noch eine seriennummer ' , ' cm_asiexception ' , ' sd_login_tu_portale ' , ' sd_webmailer_thread-anzeige ' , ' apps_dms-passwort d.3 ' , ' apps_redmine_repository ' , ' sd_uniaccount_passwortaenderung ' , ' sd_phishing ' , ' sd_sap_firefox_esr ' , ' vpn verbindung fuer unitymedia kunden ' , ' sd_kurs-angebote anmeldung ' , ' sd_unicard fehlerhafte geldbuchung ' , ' sd_uniaccount_ehemalige_passwortaenderung ' , ' sd_sap_dienstreise ' , ' cm_lsf-boss_freischaltung ' , ' wlan ' , ' uni card ' , ' sd_webmailer einrichtung weiterleitung ' , ' spam ohne tu bezug ' , ' sd_outlook_in_exchange_einbinden ' , ' sd_wlan_beratung ' , ' sd_uniaccount_dauer freischaltung ' , ' sd_sap_konteneinsicht_ workaround ' , ' sd_vpn anleitungen ' , ' sd_asknet_mitarbeiter_softwarebestellung ' , ' sd_unicard_abholung ' , ' sd_vpn_probleme_mit_unitymedia ' , ' sd_diensthandy beschaffung ' , ' sd_unicard_defekt ' , ' sd_freischaltung uniaccount verzoegert ' , ' sd_kurs-angebote itmc ' , ' bd_goeke_allgemein ' , ' sd_uniaccount_ehemalige_studierende ' , ' sd_stellenausschreibung schwarzes brett ' , ' freischaltung uniaccount ' , ' sd_unicard_workaround_bestellung ' , ' probleme mit der namensaenderung/ neue unicard ' , ' ub_geldchip-problem bei uc ' , ' sd_semesterticket ' , ' problem mit der beantragung von der unicard ' , ' sd_citavi bestellung ' , ' sd_immatrikulationsbescheigung_druckfehler ' , ' sd_vpn_aktualisierung ' , ' vpn_ipsec_stoerung ' , ' sd_dreamspark ' , ' ub_namensaenderung ' , ' sd_immatrikulationsbescheinigung_portal ' , ' 
ub_prod_neue unicard bei beschaedigung ' , ' sd_vpn_webvpn ' , ' sd_telefonbuch_prof_eintragung ' , ' sd_kontakt_asknet ' , ' probleme mit unicard ' , ' sd_office 356 plus bestellung ' , ' sd_gmx_web.de ' , ' fehlender eintrag im elektronischen telefonbuch ' , ' ub_prod_namenskorrektur_student ' , ' einrichtung des eduroam netzwerks ' , ' sd_sap_initialkennwort ' , ' sd_boss-bescheinigung ' , ' sd_wlan passwort setzen ' , ' sd_aktivierung uniaccount ' , ' sd_gleitzeitanlage_dez3_stoerung ' , ' sd_heirat_namensaenderung_student ' , ' ub_unicard_spaetere abholung moeglich? ' , ' unicard nochmal beantragen ' , ' sd_studisek_buchung
labellist = [ ' sd_antworten_korrekt ' , ' sd_kurs-angebote anmeldung ' , ' sd_semesterticket ' , ' apps_dms-passwort d.3 ' , ' freischaltung uniaccount ' , ' sd_heirat_namensaenderung_student ' , ' bd_unicard_freigabe_beantragung ' , ' sd_uniaccount_ehemalige_studierende ' , ' sd_sap_dienstreise ' , ' sd_origin_workaround ' , ' sd_uniaccount_ehemalige_passwortaenderung ' , ' fehlender eintrag im elektronischen telefonbuch ' , ' wlan ' , ' sd_tu-app feedback standard ' , ' sd_wlan_beratung ' , ' sd_uniaccount_passwortaenderung ' , ' re: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss ' , ' sd_webmailer_threadanzeige und weiterleitung ' , ' ub_unicard_spaetere abholung moeglich? ' , ' sd_citavi_support ' , ' sd_outlook kontakte automatische aktualisierung ' , ' sd_origin nur noch eine seriennummer ' , ' lsf freischaltung als mitarbeiter/in ' , ' cm_asiexception ' , ' sd_freischaltung uniaccount verzoegert ' , ' ub_unicard_zusendung der karte moeglich? ' , ' sd_login_unibib ub-it ' , ' uni card ' , ' sd_outlook anmeldung gestoert ' , ' d.3 client installation ' , ' ub_unicard_abholungszeiten ' , ' sd_antwort_phishingmail ' , ' sd_matlab lizenzdatei pc-pools ' , ' sd_sap_initialkennwort ' , ' sd_sap_freischaltung ohne passwortaenderung ' , ' sd_spss_online_bestellung ' , ' probleme mit der namensaenderung/ neue unicard ' , ' sd_keine rueckantwort kunde ' , ' sd_unimail imap_pop3 ' , ' sd_beantragung_unicard ' , ' sd_unicard_gesperrte unicard entsperre ' , ' sd_internationaloffice ' , ' unicard nochmal beantragen ' , ' sd_stellenausschreibung schwarzes brett ' , ' sd_sophos download ' , ' cm_lsf-boss_freischaltung ' , ' sd_verlust/antrag unicard ' , ' vpn_ipsec_stoerung ' , ' sd_account_abmelden ' , ' sd_outlook_in_exchange_einbinden ' , ' ub_namensaenderung ' , ' sd_telefon (antrag: neuanschluss, umzug, änderung erledigt) ' , ' unicard vergessen abzuholen und nicht mehr da ' , ' apps_redmine_repository ' , ' einrichtung des eduroam netzwerks ' , ' 
sd_unicard_max_laufzeit ' , ' sd_gmx_web.de ' , ' sd_unicard fehlerhafte geldbuchung ' , ' sd_geraeteausleihe ' , ' spam ohne tu bezug ' , ' sd_uniaccount_dauer freischaltung ' , ' apps_dms_d.3 client installation/login d.3 funktioniert nicht ' , ' sd_office 365 plus support ' , ' sd_unicard_defekt ' , ' sd_phishing ' , ' sd_goeke drucker ' , ' ub_unicard_unicard mit vollmacht abholen ' , ' sd_gleitzeitanlage_dez3_stoerung ' , ' sd_pruefungsamt ' , ' sd_aktivierung uniaccount ' , ' sd_boss-bescheinigung ' , ' sd_sap_initialkennwort_englisch ' , ' bd_unicard_geldkarte_laden ' , ' sd_telefonbuch-eintrag_änderung ' , ' vpn verbindung fuer unitymedia kunden ' , ' sd_studisek ' , ' sd_antrag funktionale mailadresse ' , ' sd_asknet_und_dreamspark ' , ' sd_unicard_workaround_bestellung ' , ' sd_sap_firefox_esr ' , ' sd_vpn anleitungen ' , ' sd_office365_asknet ' , ' citavi_lizenzschluessel_nicht bekommen ' , ' sd_sap konteneinsicht antrag ' , ' sd_spam e-mail bekannt meldung ' , ' ub_prod_namenskorrektur_student ' , ' ub_beschaedigte unicard ' , ' sd_namensaenderung mitarbeiter ' , ' sd_mail_als_anhang ' , ' benutzer zum redmine hinzufuegen ' , ' sd_login_tu_portale ' , ' sd_email_namensaenderung ' , ' windows 10 ' , ' ungueltiges ticket siehe journal ' , ' sd_vpn_temporaerer fehler ub ' , ' ub_prod_neue unicard bei beschaedigung ' , ' sd_dreamspark ' , ' sd_webmailer einrichtung weiterleitung ' , ' sd_asknet_mitarbeiter_softwarebestellung ' , ' sd_studisek_buchung_semesterbeitrag ' , ' sd_immatrikulationsbescheinigung_portal ' , ' sd_vpn_probleme_mit_unitymedia ' , ' sd-e-mail_adresse_funktional_beantragen ' , ' sd_diensthandy beschaffung ' , ' sd_vpn_webvpn ' , ' sd_laufzeit unimail account ' , ' sd_citavi ' , ' problem mit der beantragung von der unicard ' , ' sd_kurs-angebote itmc ' , ' sd_telefonbuch, neues system ' , ' sd_login tu portale ' , ' sd_wlan passwort setzen ' , ' sd_zugriff_onlinedienste_rueckmeldung ' , ' unicard_restbetrag_auszahlung ' , ' 
sd_immatrikulationsbescheigung_druckfehler ' , ' bd_unicard_nicht_eingeschrieben ' , ' sd_unimail zu exchange ' , ' sd_wlan-gastkonto ' , ' probleme mit unicard ' , ' sd_telefonbuch_prof_eintragung ' , ' sd_vpn_aktualisierung ' , ' sd_apple-on-campus ' , ' bd_goeke_allgemein ' , ' studierendenausweis ' , ' ub_drucker kopierer ' , ' sd_unicard_abholung ' , ' sd_office 356 plus bestellung ' , ' ohne betreff ' , ' sd_tu_app_keine internetverbindung ' , ' sd_boss_notenverbuchung ' , ' ub_geldchip-problem bei uc
for l in used_list :
if l not in labellist :
print ( l )
print ( len ( used_list ) )
print ( len ( labellist ) )
"""
2017-11-17 11:46:57 +01:00
"""
vllt kategorien in unterkategorien aufteilen
2017-09-11 17:29:54 +02:00
2017-11-17 11:46:57 +01:00
allg :
utf - korregieren , bei sonderzeichen wörter trennen
namen raus , addressen nach grüßen
emails , urls , nummern raus
vllt sogar alles , was ebend jenes enthält ( oder auf . toplvldomain bzw . sonderzeichen enthält oder alles was ein @ enthält
sinnvoller wörter von müll trennen : 8203 ; verfügung
abkürzungen raus : m . a , o . ä .
sinnlose bsp : nr54065467 455 a33c5 tvt ? = - - - - - - problem - - - - - - - -
" \n \n \n Time Elapsed Topic Modeling: {0} \n \n " . format ( end - start ) )
"""
2017-09-12 14:56:11 +02:00
2017-11-06 12:54:59 +01:00
"""
# load config
config_ini = FILEPATH + " config.ini "
2017-09-15 14:32:44 +02:00
2017-09-13 12:53:09 +02:00
config = ConfigParser . ConfigParser ( )
2017-09-15 14:32:44 +02:00
with open ( config_ini ) as f :
2017-09-13 12:53:09 +02:00
config . read_file ( f )
2017-11-06 12:54:59 +01:00
PARSER = spacy . load ( " de " )
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
corpi = textacy . Corpus ( PARSER )
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
testcontetn = [
" fdsfdsfsd " ,
" juzdtjlkö " ,
" gfadojplk "
]
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
testmetda = [
{ " categoryName " : " zhb " , " Solution " : " " , " Subject " : " schulungstest " } ,
{ " categoryName " : " neuanschluss " , " Solution " : " subject " , " Subject " : " telephone contract " } ,
{ " categoryName " : " zhb " , " Solution " : " " , " Subject " : " setuji " }
]
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
def makecontent ( testcontetn ) :
for content in testcontetn :
yield content
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
def makemeta ( testmetda ) :
for metdata in testmetda :
yield metdata
2017-09-12 14:56:11 +02:00
2017-11-06 12:54:59 +01:00
def corpus2Text ( corpus ) :
for doc in corpus :
yield doc . text
2017-09-13 12:53:09 +02:00
2017-09-11 17:29:54 +02:00
2017-11-06 12:54:59 +01:00
corpi . add_texts (
makecontent ( testcontetn ) ,
makemeta ( testmetda )
)
2017-09-11 17:29:54 +02:00
2017-11-06 12:54:59 +01:00
save_corpus ( corpi , corpus_path = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/test " , corpus_name = " test " )
2017-09-11 17:29:54 +02:00
2017-11-06 12:54:59 +01:00
bla = " uni mail account adresse woche falsch laufen schicken gerne januar betreff herr nachricht gruesse dezernat liebe datum freitag anfrage dienstag unicard karte abholen defekt bibliothek abholung dezember beantragung status gerne portal email nummer service id vorname prozess dez schauen eg rechner mitarbeiterin benutzerkonto oktober wissenschaftliche projekt fr download hilfskraft verantwortliche link dringend antrag schnelle arbeitsplatz november admin rahmen stand geschickt server outlook ordner bild konto postfach campus hi ueberpruefung sued beste daten freuen semester login benutzer gerne erstellen stelle frage system boss moeglichkeit student schoen spam alias geld vertrag juni ansprechpartner telefon raum einrichtung gebaeude telefonbuch abteilung element eintrag nutzer raum pc gerne lehrstuhl voraus fakultaet verfuegung herzliche drucker erreichen tlaptop kabel problem klaerung url adapter feedback koeln grundsaetzlich kaufmann problem fehler verbindung anhang meldung client netz netzwerk wenden funktionieren liebe mitarbeiter unterstuetzung aktuell herr benoetigt raumplanung gb weber vorab ueckmeldung software lizenz programm kurze urlaub gerne installation dankbar informieren team service problem loesung bestellung verlaengern verteiler alte aendern februar oeffnen update pdf browser notwendig fenster schulung beginn wege nord tkurs frage studierende personen teilnehmer standort gerne herunterladen voraus zusenden ews veranstaltung datei iso text umstellung absender message date html arbeit kaiser erfolgreich thema ablauf art at einfuehrung umfrage cloud zugang zugreifen montag probleme kollegin profil server handy web file ticket drucker einrichten senden nr mittwoch card mitteilen nrw kontakt mail fax universitaet it institut hardware hinweis fakultaet not strasse loeschen liste funktion auftrag zeitraum verwaltung angebot vorgehen entfernen moeglichkeit gefunden benutzername informatik gruppe eingabe nachname chemie dame b. 
angepasst name schoene abt post zukommen verlaengerung sommersemester fehlen namensaenderung auskunft tu dr prof pruefung herr namen fakultaet bereich lehrstuhl installieren buero ok anschluss maerz theologie notebook herr berechtigung master vorbeikommen passwort anmelden account hilfe helfen uniaccount anmeldung kennwort problem boss zugriff referat screenshot support laufwerk bildschirm super tastatur button auswaehlen "
bla = bla . split ( )
print ( len ( bla ) )
print ( len ( set ( bla ) ) )
print ( )
"""
2017-09-11 17:29:54 +02:00
2017-11-06 12:54:59 +01:00
"""
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
dict = { " unicard redaktionsteam " : 189 , " kms " : 131 , " itmc_st \u00f6 rungen " : 17 , " benutzerverwaltung_probleme " : 168 , " mailverteiler exchange " : 130 , " beamer " : 70 , " cws_confluence " : 190 , " benutzerverwaltung " : 26 , " sos " : 166 , " virtuelle server " : 116 , " sap " : 7 , " wlan " : 21 , " lsf " : 6 , " gastaufenthalt " : 8 , " umzug " : 5 , " firewall betreuung " : 129 , " ausleihe " : 39 , " fiona " : 10 , " kursplanung " : 195 , " schulungsraum verwaltung " : 200 , " plagiatserkennung " : 32 , " designentwicklung " : 100 , " ub basis it " : 184 , " tsm " : 51 , " backup tsm " : 110 , " raumkalender " : 174 , " veeam " : 149 , " linux bs " : 42 , " hochleistungsrechnen " : 90 , " e learning " : 37 , " h \u00f6 rsaal \u00fc bertragung " : 52 , " sophos " : 88 , " service portal redaktion " : 182 , " verkauf " : 93 , " fk 16 " : 30 , " campus app " : 54 , " dns " : 71 , " kurse " : 196 , " itmc schulungsr \u00e4 ume " : 96 , " leitung " : 91 , " telefon " : 14 , " housing " : 135 , " softwarelizenzen " : 35 , " hcm stammdaten " : 68 , " semesterticket " : 197 , " exchange nutzung " : 33 , " mediendienste " : 167 , " sam spider " : 172 , " pvp " : 27 , " webserver " : 29 , " werkvertr \u00e4 ge " : 158 , " ibz raumbuchung " : 177 , " webmailer " : 126 , " unicard sperrung " : 64 , " cd dvd produktion " : 114 , " lizenzserver " : 92 , " pr \u00fc fungsmanagement " : 38 , " blogs wikis foren " : 87 , " unicard ausgabe " : 161 , " pools " : 157 , " desktop & basisdienste " : 144 , " antrag auf rechnungserstellung " : 193 , " mailalias " : 121 , " evaexam " : 133 , " neuanschluss " : 0 , " mobilfunkvertr \u00e4 ge " : 69 , " ftp server " : 191 , " haustechnik " : 77 , " raumbuchungssysteme " : 186 , " confluence " : 181 , " uniaccount zugangsdaten " : 47 , " itmc medienr \u00e4 ume ef50 " : 171 , " dokoll support " : 128 , " elektronisches telefonbuch " : 3 , " softwareverteilung " : 153 , " overhead projektor " : 104 , " sicherheit " : 145 
, " itmc_als " : 48 , " itmc pools " : 160 , " zhb " : 60 , " serversupport " : 101 , " veranstaltungen " : 61 , " fk12 webauftritt " : 138 , " hardware " : 142 , " unicard produktion " : 156 , " telefonkonferenzen " : 170 , " dhcp " : 188 , " zertifikate server dfn " : 139 , " lan " : 1 , " datanet " : 49 , " neuausstattung " : 173 , " moodle " : 16 , " abmeldung " : 13 , " uni mail " : 15 , " medienr \u00e4 ume ef50 " : 117 , " verschiedene aufgaben " : 40 , " zentrale webserver " : 75 , " vorlesungsaufzeichnung " : 152 , " grafik " : 132 , " campus management " : 72 , " hacker angriff " : 46 , " pos " : 23 , " zugangsdaten " : 41 , " serviceportal " : 63 , " ews " : 24 , " voicemail box " : 150 , " service desk itmc " : 74 , " test " : 180 , " beschaffung " : 57 , " bestellung " : 185 , " vpn " : 55 , " app feedback " : 66 , " allgemein " : 134 , " rundmail " : 105 , " telefonabrechnung " : 199 , " limesurvey " : 31 , " unicard " : 28 , " eldorado " : 140 , " uniaccount " : 12 , " plotter " : 125 , " mdm mobile device management " : 120 , " namens \u00e4 nderung " : 43 , " sd " : 84 , " basis applikationen " : 103 , " \u00e4 nderung " : 194 , " fileserver einrichtung " : 187 , " fk14_test " : 154 , " werkst \u00e4 tte " : 179 , " itmc_aufgaben " : 45 , " formulare antr \u00e4 ge " : 81 , " facility " : 192 , " web " : 169 , " asknet " : 136 , " server storage " : 113 , " mail groupware " : 20 , " rektorat -b \u00fc ro " : 178 , " office " : 50 , " werkstoffe lehrstuhl bauwesen " : 59 , " telefonzentrale " : 115 , " verwaltung " : 4 , " netze " : 22 , " beantragung " : 82 , " d.3 dms " : 148 , " redmine projektverwaltung " : 141 , " wsus " : 106 , " lido " : 118 , " rechnerr \u00e4 ume " : 143 , " matrix42_hilfe " : 18 , " boss service desk " : 44 , " konteneinsicht " : 62 , " spam phishing " : 53 , " forensic " : 164 , " fk 12 " : 11 , " benutzungsverwaltung " : 198 , " redmine " : 79 , " basis app " : 85 , " viren " : 95 , " fk12 migration " : 155 , " 
raumbuchung " : 109 , " virtuelle desktops citrix " : 176 , " outlook_einrichtung " : 123 , " kundenserver " : 137 , " nrw ticket " : 80 , " weiterentwicklung " : 127 , " siport zugangskontrolle " : 98 , " e mail dienste " : 99 , " vorlagenerstellung " : 36 , " video " : 19 , " studierendensekretariat " : 111 , " it sicherheit sic " : 86 , " boss " : 25 , " technik " : 58 , " dokoll pvp " : 112 , " betrieb " : 2 , " v2 campus app feedback " : 151 , " mailverteiler " : 108 , " videoschnitt " : 119 , " fk raumplanung 09 " : 9 , " sap urlaub " : 73 , " keine r \u00fc ckantwort " : 124 , " prozess- und projektmanagement " : 67 , " dienstreise " : 34 , " webgestaltung " : 78 , " schulung " : 175 , " software " : 89 , " medientechnik " : 76 , " servicedesk " : 107 , " service portal " : 94 , " software entwicklung " : 165 , " uniflow " : 159 , " ub_st \u00f6 rungen " : 162 , " fk15 " : 183 , " uhren " :
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
list = [ ( key , value ) for key , value in dict . items ( ) ]
2017-09-18 16:08:11 +02:00
2017-11-06 12:54:59 +01:00
list . sort ( key = lambda tup : tup [ 1 ] )
"""
2017-11-17 11:46:57 +01:00
2017-11-06 12:54:59 +01:00
"""
from spacy . tokens . doc import Doc as SpacyDoc
2017-09-18 16:08:11 +02:00
2017-11-06 12:54:59 +01:00
filepath = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin "
2017-09-18 16:08:11 +02:00
2017-11-06 12:54:59 +01:00
# load parser
parser = spacy . load ( " de " )
2017-09-18 16:08:11 +02:00
2017-11-06 12:54:59 +01:00
corpus_path = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/ "
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
stringstorepath = corpus_path + ' de_parser/vocab/strings.json '
with open ( stringstorepath ) as file :
parser . vocab . strings . load ( file )
2017-09-12 14:56:11 +02:00
2017-11-06 12:54:59 +01:00
vocabpath = Path ( corpus_path + ' de_parser/vocab/lexemes.bin ' )
parser . vocab . load_lexemes ( vocabpath )
2017-09-12 14:56:11 +02:00
2017-11-06 12:54:59 +01:00
spacy_vocab = parser . vocab
2017-09-12 14:56:11 +02:00
2017-11-06 12:54:59 +01:00
def readCorpus ( filepath ) :
with open_sesame ( filepath , mode = ' rb ' ) as f :
for bytes_string in SpacyDoc . read_bytes ( f ) :
yield SpacyDoc ( spacy_vocab ) . from_bytes ( bytes_string ) . text
2017-09-11 17:29:54 +02:00
2017-09-18 16:08:11 +02:00
2017-11-06 12:54:59 +01:00
textacy . fileio . write_file_lines ( readCorpus ( filepath ) , " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt " )
"""
2017-09-18 16:08:11 +02:00
2017-11-06 12:54:59 +01:00
# load raw corpus and create new one
# raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
# printRandomDoc(raw_corpus)
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
"""
spacy_doc = PARSER ( " test " )
save_obj ( spacy_doc , " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl " )
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
spacy_doc2 = load_obj ( " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl " )
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
print ( " Doc: {0} " . format ( spacy_doc2 ) )
2017-09-18 16:08:11 +02:00
2017-11-06 12:54:59 +01:00
jgibbsLLDA_root = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ "
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
LLDA_filepath = " {0} labeldict.txt " . format ( jgibbsLLDA_root )
laveldict = { ' fiona ' : 10 , ' vorlagenerstellung ' : 36 , ' webserver ' : 29 , ' matrix42_hilfe ' : 18 , ' sap ' : 7 , ' pos ' : 23 , ' verwaltung ' : 4 , ' lan ' : 1 }
with open ( LLDA_filepath , ' w ' ) as file :
file . write ( json . dumps ( laveldict ) )
"""
2017-11-17 11:46:57 +01:00
2017-11-06 12:54:59 +01:00
"""
def load_corpus ( corpus_path , corpus_name , lang = " de " ) :
from pathlib import Path
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
# load parser
parser = spacy . load ( lang )
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
stringstorepath = corpus_path + str ( lang ) + ' _parser ' + ' /vocab/strings.json '
with open ( stringstorepath ) as file :
parser . vocab . strings . load ( file )
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
vocabpath = Path ( corpus_path + str ( lang ) + ' _parser ' + ' /vocab/lexemes.bin ' )
parser . vocab . load_lexemes ( vocabpath )
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
corpus = textacy . Corpus ( parser )
2017-09-13 12:53:09 +02:00
2017-11-06 12:54:59 +01:00
contentpath = corpus_path + corpus_name + " _content.bin "
metapath = corpus_path + corpus_name + " _meta.json "
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
metadata_stream = textacy . fileio . read_json_lines ( metapath )
spacy_docs = textacy . fileio . read_spacy_docs ( corpus . spacy_vocab , contentpath )
for spacy_doc , metadata in zip ( spacy_docs , metadata_stream ) :
corpus . add_doc (
textacy . Doc ( spacy_doc , lang = corpus . spacy_lang , metadata = metadata ) )
return corpus
"""
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
"""
# THESAURUS
lexicalentries = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml "
lexicalentries = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml "
synsets = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml "
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
# single-pass transliteration of German umlauts and sharp s
_ROCK_DOTS = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})


def _normalize_written_form(written_form):
    '''Turn a writtenForm attribute into the normalised thesaurus key.'''
    # Lower-case first so capital umlauts are transliterated as well.
    text = written_form.lower().translate(_ROCK_DOTS)

    # take out all dots
    text = re.sub(r'[.]', " ", text)

    # take out everything in parentheses (greedy: first "(" to last ")")
    text = re.sub(r"\((.*)\)", " ", text)

    # collapse longer runs of whitespace
    text = textacy.preprocess.normalize_whitespace(text)

    return text.strip()


def build_thesaurus(path2lexicalentries):
    '''Build a word -> main-synonym mapping from a lexical-entries XML file.

    Every "LexicalEntry" element found two levels below the XML root
    contributes one word (its normalised writtenForm) and the synset ids
    listed on its sub-elements.

    :param path2lexicalentries: path of the XML file, parsed as UTF-8
    :return: dict mapping each word that has at least one synset to the first
        word that was registered for the word's first synset
    '''
    lexroot = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8")).getroot()

    # word -> synset ids, shaped like {"w1": ["s1", "s2"]}
    word2synsets = {}
    for group in lexroot:
        for elem in group:
            if elem.tag != "LexicalEntry":
                continue

            synlist = []
            string = "WORD"  # key used when no sub-element has a writtenForm

            for lex_dict in (subentry.attrib for subentry in elem):
                if "synset" in lex_dict:
                    synlist.append(lex_dict["synset"])
                if "writtenForm" in lex_dict:
                    string = _normalize_written_form(lex_dict["writtenForm"])

            word2synsets[string] = synlist

    # synset id -> words, shaped like {"s1": ["w1", "w2"]}
    synset2Words = {}
    for word, synsets in word2synsets.items():
        for syn in synsets:
            synset2Words.setdefault(syn, []).append(word)

    # sort by the number of words in the strings
    # NOTE(review): this orders the synset-id lists, not the word lists in
    # synset2Words - confirm which of the two was meant.
    for synsets in word2synsets.values():
        synsets.sort(key=lambda x: len(x.split()))

    # word -> main synonym, shaped like {"w1": "mainsyn"}
    thesaurus = {}
    for word, synsets in word2synsets.items():
        if not synsets:
            continue  # entry without a synset has nothing to map to
        # assumption: the first synonym is the main synonym
        thesaurus[word] = synset2Words[synsets[0]][0]
    return thesaurus
2017-09-15 14:32:44 +02:00
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# Walks the elements two levels below synroot and registers the placeholder
# "WORD" for every "Synset" id that synset2Words does not contain yet.
# NOTE(review): synroot is only assigned in a commented-out line of
# build_thesaurus, and these statements follow that function's return -
# confirm whether this is leftover code before re-enabling it.
for r in synroot :
for element in r :
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
if element . tag == " Synset " :
# synset and attrib are locals; synset is not read again in the visible code
synset = [ ]
attrib = element . attrib
# NOTE(review): the name id shadows the builtin of the same name
id = attrib [ " id " ]
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
if id not in synset2Words . keys ( ) :
synset2Words [ id ] = " WORD "
2017-09-15 14:32:44 +02:00
"""
2017-11-06 12:54:59 +01:00
"""
from postal . parser import parse_address
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
# Two sample signature/address strings are passed to parse_address and the
# parse result is printed for manual inspection.
address = " Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480 "
print ( parse_address ( address ) )
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
# second sample: a postal address without phone or fax numbers
address = " Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
print ( parse_address ( address ) )
2017-09-15 14:32:44 +02:00
"""
2017-11-06 12:54:59 +01:00
"""
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
# Location and name prefix used by the corpus save/load experiment below.
# The later code concatenates file names directly onto corpus_path.
corpus_path = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/ "
corpus_name = " testcorpus "
2017-09-15 14:32:44 +02:00
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
2017-09-13 12:53:09 +02:00
2017-09-12 14:56:11 +02:00
2017-11-06 12:54:59 +01:00
import pathlib
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# Dumps the lexemes of PARSER to lexemes.bin and reads them back into nlp.
# NOTE(review): strings_path is not used by any code visible in this section.
strings_path = pathlib . Path ( corpus_path + ' strings.json ' )
path_lexemes_bin_ = pathlib . Path ( corpus_path + ' lexemes.bin ' )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# NOTE(review): PARSER and nlp are not assigned at module level anywhere in
# the visible code (nlp only exists as a local of load_corpus) - confirm
# where they come from.
PARSER . vocab . dump ( path_lexemes_bin_ )
nlp . vocab . load_lexemes ( path_lexemes_bin_ )
2017-09-15 14:32:44 +02:00
2017-11-06 12:54:59 +01:00
def save_corpus(corpus_path, corpus_name):
    '''Persist the module-level corpus as three files sharing one prefix.

    Writes "<prefix>_strings.json" (string store of PARSER),
    "<prefix>_content.bin" (spaCy docs) and "<prefix>_meta.json" (metadata),
    where prefix is corpus_path + corpus_name. Reads the globals PARSER and
    corpi; nothing is returned.

    :param corpus_path: directory prefix; must end with a path separator
    :param corpus_name: file-name prefix shared by the three files
    '''
    prefix = corpus_path + corpus_name

    # save stringstore
    with open(prefix + '_strings.json', "w") as file:
        PARSER.vocab.strings.dump(file)

    # save content
    textacy.fileio.write_spacy_docs(
        (doc.spacy_doc for doc in corpi), prefix + "_content.bin")

    # save meta
    textacy.fileio.write_json_lines(
        (doc.metadata for doc in corpi), prefix + "_meta.json")
2017-09-13 12:53:09 +02:00
2017-09-11 17:29:54 +02:00
2017-11-06 12:54:59 +01:00
def load_corpus(corpus_path, corpus_name):
    '''Load the corpus files written by save_corpus into a new textacy corpus.

    :param corpus_path: directory prefix; must end with a path separator
    :param corpus_name: file-name prefix shared by the three corpus files
    :return: textacy.Corpus built on a freshly loaded "de" spaCy pipeline
    '''
    prefix = corpus_path + corpus_name

    # load new lang
    nlp = spacy.load("de")

    # load stringstore
    with open(prefix + '_strings.json', "r") as file:
        nlp.vocab.strings.load(file)

    # define corpus (a local; it does not touch the global of the same name)
    corpi = textacy.Corpus(nlp)

    # load meta
    metadata_stream = textacy.fileio.read_json_lines(prefix + "_meta.json")

    # load content
    spacy_docs = textacy.fileio.read_spacy_docs(
        corpi.spacy_vocab, prefix + "_content.bin")

    # docs and metadata are paired by position; zip stops at the shorter stream
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpi.add_doc(
            textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))

    return corpi
2017-09-11 17:29:54 +02:00
2017-11-06 12:54:59 +01:00
# Round trip: write the corpus files, then load them again and print the
# corpus object that load_corpus returns.
save_corpus ( corpus_path , corpus_name )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
print ( load_corpus ( corpus_path , corpus_name ) )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    '''Create a doc -> doc function that swaps tokens for their synonyms.

    The returned callable lower-cases every token of a doc, looks it up with
    getFirstSynonym against the global THESAURUS, joins the results with
    single spaces and parses that text again with parser.

    :param default_return_first_Syn: forwarded unchanged to getFirstSynonym
    :param parser: callable applied to the rebuilt text; the default PARSER
        is bound once, when this function is defined
    :return: function taking an iterable of tokens that expose lower_
    '''
    def normalize(doc):
        replaced = (
            getFirstSynonym(tok.lower_, THESAURUS,
                            default_return_first_Syn=default_return_first_Syn)
            for tok in doc
        )
        return parser(" ".join(replaced))

    return normalize
def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    '''Look up word in a thesaurus given as an iterable of synonym lists.

    A single-word synonym has to equal the lower-cased word; a multi-word
    synonym matches as soon as it contains the word as a substring. The
    first block with a match is handed to getHauptform.

    :param word: term to look up; a non-str value is returned as str(word)
    :param thesaurus: iterable of synonym blocks (iterables of str)
    :param default_return_first_Syn: forwarded unchanged to getHauptform
    :return: str - the main form of the matching block, or the lower-cased
        word when no block matches
    '''
    if not isinstance(word, str):
        return str(word)

    word = word.lower()
    single_word = re.compile(r'\A[\w-]+\Z')

    # iterate over the thesaurus; each syn_block is a list of synonyms
    for syn_block in thesaurus:
        for syn in syn_block:
            syn = syn.lower()
            if single_word.match(syn):
                matched = word == syn  # syn is a single word
            else:
                matched = word in syn  # syn is a phrase
            if matched:
                return str(getHauptform(
                    syn_block, word,
                    default_return_first_Syn=default_return_first_Syn))

    return str(word)  # fall back to the original word
def getHauptform(syn_block, word, default_return_first_Syn=False):
    '''Pick the main form ("Hauptform") out of one block of synonyms.

    An entry qualifies when its lower-cased text contains "hauptform" and
    has at most two space-separated tokens; its first token that does not
    start with a parenthesised group is returned (lower-cased).

    :param syn_block: iterable of synonym strings
    :param word: value returned when nothing else is found
    :param default_return_first_Syn: if true and no entry is marked, return
        the first entry (unchanged case) that does not start with a
        parenthesised group
    :return: the chosen main form, or word
    '''
    in_parens = re.compile(r'\([^)]+\)')

    for syn in syn_block:
        syn = syn.lower()
        tokens = syn.split(" ")
        if "hauptform" not in syn or len(tokens) > 2:
            continue
        # do not return the token that is written in parentheses
        for token in tokens:
            if not in_parens.match(token):
                return token

    if default_return_first_Syn:
        # no marked main form: take the first entry not starting with "(...)"
        for candidate in syn_block:
            if not in_parens.match(candidate):
                return candidate

    return word  # fall back to the original word
"""
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
# Collects single-word noun lemmas from deWordNet.xml into the list nomen.
# NOTE(review): nomen and regex_specialChars are not defined in this snippet;
# they have to exist before it runs.
path2xml = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml "
tree = ET . parse ( path2xml , ET . XMLParser ( encoding = " utf-8 " ) )
root = tree . getroot ( )
# "Synset" elements are looked for two levels below the root
for r in root :
for element in r :
if element . tag == " Synset " :
attrib = element . attrib
# the index i from enumerate is not used in the visible code
for i , subentry in enumerate ( element ) :
# only "Lemma" children whose partOfSpeech attribute equals "n" are kept
if subentry . tag == " Lemma " and subentry . attrib [ " partOfSpeech " ] == " n " :
string = ( subentry . attrib [ " writtenForm " ] )
# replaceRockDots
# NOTE(review): lower-casing only happens at the append below, so
# capital umlauts pass through these substitutions unchanged
string = re . sub ( r ' [ß] ' , " ss " , string )
string = re . sub ( r ' [ö] ' , " oe " , string )
string = re . sub ( r ' [ü] ' , " ue " , string )
string = re . sub ( r ' [ä] ' , " ae " , string )
# seperate_words_on_regex:
string = " " . join ( re . compile ( regex_specialChars ) . split ( string ) )
string_list = string . split ( )
# keep the entry only if exactly one word is left after splitting
if len ( string_list ) == 1 :
nomen . append ( string . lower ( ) . strip ( ) )
"""
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
import re
from collections import Counter
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def words(text):
    '''Return all runs of word characters in text, lower-cased, in order.'''
    return re.findall(r'\w+', text.lower())
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# Word-frequency table: counts every token that words() finds in the
# sentence file; the whole file is read into memory at once.
# NOTE(review): the file object returned by open() is never closed explicitly.
WORDS = Counter ( words ( open ( ' /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt ' ) . read ( ) ) )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def P(word, N=sum(WORDS.values())):
    '''Probability of `word`: its count in WORDS divided by N.

    The default N is the sum of all counts in WORDS, computed once when the
    function is defined (not on every call).
    '''
    return WORDS[word] / N
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def correction(word):
    '''Most probable spelling correction for word.

    Picks the entry of candidates(word) that has the highest P value.
    '''
    return max(candidates(word), key=P)
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def candidates(word):
    '''Generate possible spelling corrections for word.

    Returns the first non-empty choice of: the word itself if known, known
    words one edit away, known words two edits away, and finally [word].
    '''
    return (known([word])
            or known(edits1(word))
            or known(edits2(word))
            or [word])
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def known(words):
    '''The subset of `words` that appear in the dictionary of WORDS.

    :param words: iterable of candidate strings
    :return: set of the candidates that are keys of WORDS
    '''
    return {w for w in words if w in WORDS}
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    '''All edits that are one edit away from `word`.

    An edit is the deletion of one character, the swap of two neighbouring
    characters, the replacement of one character or the insertion of one
    character.

    :param word: the string to edit
    :param letters: characters used for replacements and insertions; pass a
        longer alphabet (for example with umlauts) to reach such words
    :return: set of all resulting strings
    '''
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def edits2(word):
    '''All edits that are two edits away from `word`.

    Returns a lazy generator; the same string can be produced several times.
    '''
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
### extract from derewo
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# Reads the DeReKo frequency list line by line and keeps the second field of
# every line whose third field is "NN", lower-cased and transliterated.
# NOTE(review): nomen is not defined in this snippet; it has to exist already.
raw = textacy . fileio . read_file_lines ( " DeReKo-2014-II-MainArchive-STT.100000.freq " )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
for line in raw :
line_list = line . split ( )
# NOTE(review): a line with fewer than three fields raises IndexError here
if line_list [ 2 ] == " NN " :
string = line_list [ 1 ] . lower ( )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# replaceRockDots
string = re . sub ( r ' [ß] ' , " ss " , string )
string = re . sub ( r ' [ö] ' , " oe " , string )
string = re . sub ( r ' [ü] ' , " ue " , string )
string = re . sub ( r ' [ä] ' , " ae " , string )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
nomen . append ( string . lower ( ) . strip ( ) )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# write the collected entries to nomen2.txt
textacy . fileio . write_file_lines ( nomen , " nomen2.txt " )
"""
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
# Splits the ticket CSV export into three CSV files (de / en / misc)
# according to the language detected for one column of each row.
stream = textacy . fileio . read_csv ( " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv " , delimiter = " ; " )
# NOTE(review): content_collumn_name is not read by the visible code; the
# column is addressed through the fixed index below.
content_collumn_name = " Description "
content_collumn = 9 # standardvalue
de_tickets = [ ]
en_tickets = [ ]
misc_tickets = [ ]
error_count = 0
for i , lst in enumerate ( stream ) :
# the first row is copied into all three outputs
if i == 0 :
de_tickets . append ( lst )
en_tickets . append ( lst )
misc_tickets . append ( lst )
else :
try :
content_collumn_ = lst [ content_collumn ]
# NOTE(review): detect is not defined in the visible code (presumably
# langdetect.detect - confirm); it is called up to twice per row
if detect ( content_collumn_ ) == " de " :
de_tickets . append ( lst )
elif detect ( content_collumn_ ) == " en " :
en_tickets . append ( lst )
else :
misc_tickets . append ( lst )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# bare except: any exception sends the row to misc and is only counted
except :
misc_tickets . append ( lst )
error_count + = 1
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
print ( error_count )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# one output file per language bucket, using the same ";" delimiter
textacy . fileio . write_csv ( de_tickets , " M42-Export/de_tickets.csv " , delimiter = " ; " )
textacy . fileio . write_csv ( en_tickets , " M42-Export/en_tickets.csv " , delimiter = " ; " )
textacy . fileio . write_csv ( misc_tickets , " M42-Export/misc_tickets.csv " , delimiter = " ; " )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
# Character class of punctuation and special characters; it is the default
# pattern of seperate_words_on_regex, which splits strings on it.
regex_specialChars = r ' [` \ -=~!#@,.$ % ^&*()_+ \ [ \ ] {} ; \' \\ : " |</>?] '
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def stringcleaning(stringstream, funclist):
    '''Lazily run every string through a chain of cleaning functions.

    :param stringstream: iterable of input values
    :param funclist: callables applied in order, each one receiving the
        result of the previous one
    :return: generator yielding one cleaned value per input value
    '''
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
def seperate_words_on_regex(regex=regex_specialChars):
    '''Create a function that splits a string on regex and re-joins it.

    :param regex: pattern whose matches act as separators; the default is
        the module-level regex_specialChars, bound at definition time
    :return: function str -> str joining the split parts with single spaces
    '''
    splitter = re.compile(regex)  # compiled once instead of on every call
    return lambda string: " ".join(splitter.split(string))
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# Sample tokens for trying out seperate_words_on_regex by hand.
# NOTE(review): this list rebinds the name words; another snippet in this
# file defines a function with the same name.
words = [
" uniaccount " ,
" nr54065467 " ,
" nr54065467 " ,
# NOTE(review): no comma follows the next literal, so it is concatenated with
# the literal after it into a single list entry - confirm whether intended
" 455a33c5, "
" tvt?= " ,
" tanja.saborowski@tu-dortmund.de " ,
" - " ,
" m-sw1-vl4053.itmc.tu-dortmund.de " ,
" ------problem-------- "
]
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# both compiled patterns are used only by the commented-out experiments that
# follow this snippet
topLVLFinder = re . compile ( r ' \ .[a-z] { 2,3}( \ .[a-z] { 2,3})? ' , re . IGNORECASE )
specialFinder = re . compile ( r ' [` \ -=~!@#$ % ^&*()_+ \ [ \ ] {} ; \' \\ : " |<,./>?] ' , re . IGNORECASE )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# print every sample after splitting it on the special characters
for s in stringcleaning ( ( w for w in words ) , [ seperate_words_on_regex ( ) ] ) :
print ( s . strip ( ) )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
#print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
def replaceRockDots():
    '''Create a function that lower-cases text and spells out umlauts.

    The returned callable lower-cases its argument and then replaces
    ä -> ae, ö -> oe, ü -> ue and ß -> ss.

    :return: function str -> str
    '''
    table = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})
    # one translate pass replaces the four nested regex substitutions
    return lambda string: string.lower().translate(table)
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# Normalises the stop-word list: read german_stopwords_full.txt, apply
# replaceRockDots() and whitespace normalisation to every line, then write
# the result to german_stopwords.txt.
de_stop_words = list ( textacy . fileio . read_file_lines ( filepath = " german_stopwords_full.txt " ) )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
#print(blob.entities)
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# lower-case every entry and spell out umlauts
de_stop_words = list ( map ( replaceRockDots ( ) , de_stop_words ) )
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
# second pass over the list with textacy's whitespace normaliser
de_stop_words = list ( map ( textacy . preprocess . normalize_whitespace , de_stop_words ) )
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy . fileio . write_file_lines ( de_stop_words , " german_stopwords.txt " )
2017-09-14 11:40:00 +02:00
2017-11-06 12:54:59 +01:00
"""
2017-09-14 11:40:00 +02:00