# -*- coding: utf-8 -*-
from datetime import datetime

import csv
import sys
import os
import time

from miscellaneous import *
from scipy import *
import textacy

csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"
ressources_path = FILEPATH + "ressources/"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]'  # +r',.'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
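# Illustrative sketch (not part of the original code): what the two patterns are meant to catch.
#
#   bool(re.search(REGEX_TOPLVL, "mailbox.de"))        # True  -> ends in a top-level-domain-like suffix
#   bool(re.search(REGEX_SPECIALCHAR, "foo|bar"))      # True  -> contains a special character
#   bool(re.search(REGEX_SPECIALCHAR, "drucker"))      # False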

global THESAURUS
global WORDS
global LEMMAS
global NOUNS
global VORNAMEN
global DE_STOP_WORDS
global EN_STOP_WORDS

THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = {}
VORNAMEN = {}
DE_STOP_WORDS = {}
EN_STOP_WORDS = {}

############# filter tokens

def filterTokens(tokens, funclist):
    # in: token list, list of filter functions
    # out: filtered token list
    for f in funclist:
        tokens = list(filter(f, tokens))

    return tokens

def keepPOS(pos_list):
    return lambda tok: tok.pos_ in pos_list


def keepNouns(noun_list=NOUNS):
    # note: the default is bound to the (still empty) module-level NOUNS at definition time
    #return lambda tok: tok.lower_ in noun_list
    return lambda tok: tok.lower_ in noun_list or tok.pos_ == "NOUN"


def removePOS(pos_list):
    return lambda tok: tok.pos_ not in pos_list


def removeWords(words, keep=None):
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                words.remove(k)
            except ValueError:
                pass

    return lambda tok: tok.lower_ not in words


def keepENT(ent_list):
    return lambda tok: tok.ent_type_ in ent_list


def removeENT(ent_list):
    return lambda tok: tok.ent_type_ not in ent_list


def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search(r'\d', tok.lower_))


def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(REGEX_TOPLVL, tok.lower_))


def remove_words_containing_specialCharacters():
    return lambda tok: not bool(re.search(REGEX_SPECIALCHAR, tok.lower_))


def remove_long_words():
    # drop very long tokens (more than 35 characters); bodies of the two length filters were swapped
    return lambda tok: not len(tok.lower_) > 35


def remove_short_words():
    # drop very short tokens (fewer than 2 characters)
    return lambda tok: not len(tok.lower_) < 2


def remove_first_names():
    return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]
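
# Usage sketch (illustrative only, not part of the original pipeline): the factories above
# return predicates that can be combined and handed to filterTokens(); `parser` is assumed
# to be an already loaded spacy language model.
#
#   doc = parser("Hallo Herr Mustermann, der Drucker im Raum 3 druckt nicht.")
#   filters = [removePOS(["PUNCT", "SPACE", "NUM"]),
#              remove_words_containing_Numbers(),
#              remove_first_names()]
#   kept = filterTokens([tok for tok in doc], filters)
#   # -> the remaining spacy tokens, e.g. "Drucker", "Raum", ...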

############# strings

def remove_addresses(string):
    pass  # todo remove_addresses: idea: use postal.parser and add the address to the metadata


def lemmatizeWord(word, lemma_dict=LEMMAS, n=5):
    # look the word up in the lemma dictionary, repeating n times to resolve chained lemmas
    for i in range(n):
        try:
            word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
        except:
            print(word)
    return word

def getFirstSynonym(word, thesaurus=THESAURUS, n=3):
    # look the word up as-is, title-cased and lower-cased; return its main synonym if found
    for i in range(n):
        try:
            if word in thesaurus.keys():
                return thesaurus[word]

            elif word.title() in thesaurus.keys():
                return thesaurus[word.title()]

            elif word.lower() in thesaurus.keys():
                return thesaurus[word.lower()]

            else:
                return word
        except:
            print("THESAURUS ERROR FOR: {}".format(word))
    return word
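
# Usage sketch (illustrative; the keys shown here are made up, the real mappings live in the
# pickled LEMMAS / THESAURUS resources loaded further below):
#
#   LEMMAS = {"druckern": "drucker"}
#   THESAURUS = {"rechner": "computer"}
#   lemmatizeWord("Druckern", lemma_dict=LEMMAS)        # -> "drucker"
#   getFirstSynonym("Rechner", thesaurus=THESAURUS)     # -> "computer"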

########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download

import re


def words(text): return re.findall(r'\w+', text.lower())


def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    # note: the default N is evaluated at definition time, i.e. while WORDS is still empty
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


def autocorrectWord(word):
    try:
        return correction(word)
    except:
        return word
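
# Usage sketch (illustrative; WORDS has to be filled with a word-frequency dictionary first,
# e.g. via WORDS = load_obj(path2wordsdict), which is currently commented out further below):
#
#   WORDS = {"drucker": 120, "ticket": 80}
#   known(["drucker", "drucekr"])     # -> {"drucker"}
#   autocorrectWord("drucekr")        # -> "drucker"  (one transposition away)
#   autocorrectWord("xyz123")         # -> "xyz123"   (unknown, returned unchanged)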

############# stringcleaning

def processContentstream(textstream, parser, token_filterlist=None):
    # pre-parse
    textstream = preparse(textstream)

    pipe = parser.pipe(textstream)

    for doc in pipe:
        tokens = [tok for tok in doc]

        # in-parse
        if token_filterlist is not None:
            tokens = filterTokens(tokens, token_filterlist)

        # post-parse
        # todo TUNING: tokens = [postparse(tok) for tok in tokens]  # todo: this loses the pos, tag etc. information!
        tokens = [tok.lower_ for tok in tokens]

        yield " ".join(tokens)
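
# Usage sketch (illustrative; `parser` is assumed to be a loaded spacy model and the filter
# list is built from the factories above; note that preparse() matches "gruss" case-sensitively):
#
#   texts = ["mein drucker druckt nicht. gruss max"]
#   filters = [removePOS(["PUNCT", "SPACE"]), remove_first_names()]
#   cleaned = list(processContentstream(texts, parser, token_filterlist=filters))
#   # -> one lower-cased, filtered string per input; everything from "gruss" on is cut off by preparse()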

def preparse(stringstream):
    # cut each string after the first closing greeting ("gruss" etc.)
    for string in stringstream:
        words = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]

        for gr in words:
            if gr in string:
                string = string.rpartition(gr)[0]
                break

        yield string


def postparse(toktext):
    """
    :param toktext: spacy.Token
    :return: string
    """
    toktext = toktext.lower_

    # remove_words_containing_topLVL
    toktext = toktext if not re.search(REGEX_TOPLVL, toktext) else ""

    # lemmatize
    toktext = lemmatizeWord(toktext)

    # normalize synonyms
    toktext = getFirstSynonym(toktext)

    # autocorrect
    toktext = autocorrectWord(toktext)

    return toktext

def processDictstream(dictstream, funcdict, parser):
    """
    :param dictstream: dict-gen
    :param funcdict:
            clean_in_meta = {
            "Solution": funclist,
            ...
            }

    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():

            if key in funcdict:

                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = filterTokens(tokens, funclist)

                result[key] = " ".join([tok.lower_ for tok in tokens])

            else:
                result[key] = value
        yield result
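
# Usage sketch (illustrative): only the metadata fields listed in funcdict are parsed and
# filtered, everything else is passed through untouched.
#
#   meta = [{"Subject": "Drucker defekt!!", "TicketNumber": "INC12345"}]
#   clean_in_meta = {"Subject": [removePOS(["PUNCT", "SPACE"])]}
#   cleaned = list(processDictstream(meta, clean_in_meta, parser))
#   # -> [{"Subject": "drucker defekt", "TicketNumber": "INC12345"}]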

##################################################################################################

path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")

path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")

path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")

path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")

path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")

path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")

custom_words = get_list_from_config("preprocessing", "custom_words")

corpus_de_path = FILEPATH + config.get("de_corpus", "path")
de_plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt"

corpus_en_path = FILEPATH + config.get("en_corpus", "path")

def extract_from_corpus(corpus):
    """
    Extract from each doc of a corpus a string containing the desired token texts.

    :param corpus: textacy.Corpus
    :return: string-gen
    """

    # build the WHITELIST; it contains at least the potential topics
    WHITELIST = ["boss", "sap", "firefox"]  # todo auto-generate relevant technical terms
kb_cats = [ ' eldorado ' , ' cws_confluence ' , ' wsus ' , ' mail groupware ' , ' d.3 dms ' , ' serviceportal ' , ' softwarelizenzen ' , ' sophos ' , ' webserver ' , ' sap ' , ' ftp server ' , ' dhcp ' , ' tonerboerse ' , ' mailalias ' , ' arbeitsplatzsupport ' , ' mediendienste ' , ' mailverteiler ' , ' uni mail ' , ' basis app ' , ' videoschnitt ' , ' DEFAULT ' , ' verwaltung ' , ' matrix42_hilfe ' , ' hoersaaluebertragung ' , ' redmine ' , ' uniflow ' , ' keine rueckantwort ' , ' pools ' , ' leitung ' , ' netze ' , ' konteneinsicht ' , ' kennwort aenderung ' , ' datanet ' , ' neuanschluss ' , ' semesterticket ' , ' asknet ' , ' veranstaltungen ' , ' housing ' , ' fk 16 ' , ' fiona ' , ' betrieb ' , ' vorlagenerstellung ' , ' studierendensekretariat ' , ' pvp ' , ' mobilfunkvertraege ' , ' ausleihe ' , ' web ' , ' spam phishing ' , ' sap urlaub ' , ' evaexam ' , ' vorlesungsaufzeichnung ' , ' firewall betreuung ' , ' ub basis it ' , ' virtuelle desktops citrix ' , ' fk15 ' , ' virtuelle server ' , ' lizenzserver ' , ' elektronisches telefonbuch ' , ' joomla itmc website ' , ' weiterentwicklung ' , ' serversupport ' , ' wlan ' , ' kurse ' , ' technik ' , ' raumkalender ' , ' backup tsm ' , ' haustechnik ' , ' voicemail box ' , ' facility ' , ' unicard ausgabe ' , ' mdm mobile device management ' , ' entwicklung ' , ' webgestaltung ' , ' unicard sperrung ' , ' forensic ' , ' basis applikationen ' , ' overhead projektor ' , ' plagiatserkennung ' , ' uniaccount zugangsdaten ' , ' zentrale webserver ' , ' webmailer ' , ' fk12 webauftritt ' , ' plotter ' , ' campus management ' , ' ub_stoerungen ' , ' rundmail ' , ' telefon ' , ' raumbuchung ' , ' fk12 migration ' , ' dienstreise ' , ' hardware ' , ' it sicherheit sic ' , ' hochleistungsrechnen ' , ' unicard ' , ' sos ' , ' benutzerverwaltung_probleme ' , ' confluence ' , ' vpn ' , ' zhb ' , ' campus app ' , ' itmc_aufgaben ' , ' sicherheit ' , ' schulungsraum verwaltung ' , ' unicard produktion ' , ' schulung ' , ' video ' , ' dokoll support ' , ' sd ' , ' servicedesk ' , ' v2 campus app feedback ' , ' lido ' , ' app feedback ' , ' ibz raumbuchung ' , ' hcm stammdaten ' , ' itmc_stoerungen ' , ' boss service desk ' , ' exchange nutzung ' , ' office ' , ' rektorat -buero ' , ' bestellung ' , ' moodle ' , ' fk raumplanung 09 ' , ' aenderung ' , ' neuausstattung ' , ' benutzerverwaltung ' , ' rechnerraeume ' , ' designentwicklung ' , ' fk 12 ' , ' werkstoffe lehrstuhl bauwesen ' , ' server storage ' , ' beantragung ' , ' visitenkartenproduktion ' , ' gastaufenthalt ' , ' telefonkonferenzen ' , ' raumbuchungssysteme ' , ' fk14_test ' , ' e mail dienste ' , ' grafik ' , ' ews ' , ' itmc schulungsraeume ' , ' tsm ' , ' softwareverteilung ' , ' beamer ' , ' lizenzmanagement ' , ' fileserver einrichtung ' , ' redmine projektverwaltung ' , ' service desk itmc ' , ' pruefungsmanagement ' , ' prozess- und projektmanagement ' , ' formulare antraege ' , ' namensaenderung ' , ' verkauf ' , ' software ' , ' itmc medienraeume ef50 ' , ' zugangsdaten ' , ' medientechnik ' , ' lan ' , ' veeam ' , ' unicard redaktionsteam ' , ' changes ' , ' service portal ' , ' limesurvey ' , ' dns ' , ' dokoll pvp ' , ' uhren ' , ' nrw ticket ' , ' itmc_als ' , ' linux bs ' , ' werkvertraege ' , ' blogs wikis foren ' , ' test ' , ' abmeldung ' , ' desktop & basisdienste ' , ' telefonzentrale ' , ' siport zugangskontrolle ' , ' antrag auf rechnungserstellung ' , ' verschiedene aufgaben ' , ' kundenserver ' , ' medienraeume ef50 ' , ' videokonferenzen ' , ' benutzungsverwaltung 
' , ' mailverteiler exchange ' , ' lsf ' , ' telefonabrechnung ' , ' werkstaette ' , ' uniaccount ' , ' outlook_einrichtung ' , ' itmc webauftritt ' , ' zertifikate server dfn ' , ' allgemein ' , ' umzug ' , ' service portal redaktion ' , ' pos ' , ' beschaffung ' , ' boss ' , ' hacker angriff ' , ' software entwicklung ' , ' cd dvd produktion ' , ' sam spider ' , ' viren ' , ' kursplanung ' , ' itmc pools ' , ' kms ' , ' e learning ' ]
kb_keys = [ ' zugriff_onlinedienste_rueckmeldung ' , ' uniaccount ' , ' freischaltung ' , ' asknet ' , ' eduroam ' , ' donnerstagsmail namensaenderung ' , ' asiexception ' , ' lsf ' , ' kundenantwort ' , ' chip ' , ' unitymedia ' , ' citavi ' , ' fehler ' , ' windows beziehen ' , ' wlan ' , ' ipv6 ' , ' freischaltung verzoegert ' , ' betrag ' , ' " defekte karte " ' , ' risse ' , ' laden ' , ' sap portal anderer modus ' , ' goeke ' , ' informationen des itmc zum einsatz ' , ' transport wurde durchgefuehrt. ' , ' wi-fi ' , ' unicard_auszahlung ' , ' ausleihe ' , ' unimail ' , ' uni-account ' , ' unicard ' , ' beantragung ' , ' nrw-ticket ' , ' printservice ' , ' dms ' , ' ip6 ' , ' transport und beschreibung zum transportauftrag ! ' , ' wlan passwort ' , ' dokumentenmanagementsystem ' , ' webmailer ' , ' vpn ' , ' repository ' , ' unicard ' , ' projekte ' , ' eingeschrieben ' , ' unicard abholung oeffnungszeiten ' , ' d3 ' , ' beantragung ' , ' app tu-dortmund feedback ' , ' semester ticket ' , ' redmine ' , ' git ' , ' geldkarte ' , ' outlook_exchange ' , ' spam standardmeldung phishing ' , ' automatische aktualisierung der selbst angelegten kontakte in outlook ' , ' " beschaedigte unicard " ' , ' elektronische telefonbuch ' , ' boss ' , ' wwrite ' , ' DEFAULT ' , ' anyconnect ' , ' wifi ' ]
kb_subjs = [ ' sd_office 365 plus support ' , ' citavi_lizenzschluessel_nicht bekommen ' , ' uni card ' , ' sd_office 356 plus bestellung ' , ' sd_gastaufenthalter ' , ' sd_outlook kontakte automatische aktualisierung ' , ' benutzer zum redmine hinzufuegen ' , ' sd_matlab lizenzdatei pc-pools ' , ' sd_tu-app feedback standard ' , ' vpn_ipsec_stoerung ' , ' vpn verbindung fuer unitymedia kunden ' , ' ub_prod_abholung_ abholfristen_benachrichtigungen ' , ' einrichtung des eduroam netzwerks ' , ' sd_webmailer_threadanzeige und weiterleitung ' , ' sd_wlan passwort setzen ' , ' ub_prod_namenskorrektur_student ' , ' sd_unimail imap_pop3 ' , ' sd_outlook_in_exchange_einbinden ' , ' sd_keine rueckantwort kunde ' , ' sd_asknet_und_dreamspark ' , ' sd_heirat_namensaenderung_student ' , ' bd_unicard_nicht_eingeschrieben ' , ' wlan ' , ' sd_telefonbuch_prof_eintragung ' , ' change produktiv nehmen chn00146 - transport e01k909284 ' , ' ungueltiges ticket siehe journal ' , ' apps_dms_d.3 client installation/login d.3 funktioniert nicht ' , ' d.3 client installation ' , ' unicard_restbetrag_auszahlung ' , ' cm_asiexception ' , ' sd_origin_workaround ' , ' sd_vpn_aktualisierung ' , ' problem mit der beantragung von der unicard ' , ' sd_unicard fehlerhafte geldbuchung ' , ' sd_login tu portals english ' , ' sd_gmx_web.de ' , ' studierendenausweis ' , ' sd_citavi ' , ' sd_fk9 test ' , ' sd_webmailer_thread-anzeige ' , ' bd_unicard_geldkarte_laden ' , ' ub_unicard_unicard mit vollmacht abholen ' , ' sd_stellenausschreibung schwarzes brett ' , ' freischaltung uniaccount ' , ' sd_asknet_mitarbeiter_softwarebestellung ' , ' how to setup eduroam ' , ' sd_citavi bestellung ' , ' unicard vergessen abzuholen und nicht mehr da ' , ' sd_unimail zu exchange ' , ' sd_diensthandy beschaffung ' , ' sd_sap konteneinsicht antrag ' , ' sd_unicard_defekt ' , ' sd_webmailer einrichtung weiterleitung ' , ' sd_kurs-angebote anmeldung ' , ' m42_dokumentationen_zu_neuen_ous ' , ' sd_sap_initialkennwort ' , ' sd_sap_freischaltung ohne passwortaenderung ' , ' sd_telefonbuch-eintrag_aenderung ' , ' sd_pruefungsamt ' , ' sd_phishing ' , ' apps_dms-passwort d.3 ' , ' sd_goeke drucker ' , ' sd_sap_dienstreise ' , ' unicard nochmal beantragen ' , ' sd_outlook anmeldung gestoert ' , ' sd_citavi_support ' , ' DEFAULT ' , ' sd_geraeteausleihe ' , ' sd_account_abmelden ' , ' sd_uniaccount freischaltung verzoegert englisch ' , ' ub_beschaedigte unicard ' , ' sd_gleitzeitanlage_dez3_stoerung ' , ' transportdurchfuehung ' , ' sd_sap_initialkennwort_englisch ' , ' sd_antwort_phishingmail ' , ' sd_namensaenderung mitarbeiter ' , ' re: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss ' , ' lsf freischaltung als mitarbeiter/in ' , ' ub_unicard_spaetere abholung moeglich? ' , ' sd_antrag funktionale mailadresse ' , ' sd_apple-on-campus ' , ' sd_office365_asknet ' , ' sd_sophos download ' , ' sd_freischaltung uniaccount verzoegert ' , ' ub_unicard_zusendung der karte moeglich? 
' , ' ohne betreff ' , ' sd_immatrikulationsbescheinigung_portal ' , ' sd_studisek_buchung_semesterbeitrag ' , ' sd_studisek_englisch ' , ' probleme mit der namensaenderung/ neue unicard ' , ' sd_telefonbuch, neues system ' , ' fehlender eintrag im elektronischen telefonbuch ' , ' sd_boss_notenverbuchung ' , ' sd_laufzeit unimail account ' , ' sd_semesterticket ' , ' sd_kontakt_asknet ' , ' windows 10 ' , ' sd_login_tu_portale ' , ' ub_geldchip-problem bei uc ' , ' sd_zugriff_onlinedienste_rueckmeldung ' , ' sd_wlan-gastkonto ' , ' sd_tu_app_keine internetverbindung ' , ' sd_uniaccount_ehemalige_passwortaenderung ' , ' sd_verlust/antrag unicard ' , ' sd_sap_konteneinsicht_ workaround ' , ' apps_redmine_repository ' , ' sd_itmc kurse anmeldebestaetigung ' , ' sd_mail_als_anhang ' , ' bd_unicard_chip_defekt ' , ' probleme mit unicard ' , ' ub_unicard_abholungszeiten ' , ' sd_falsche_personendaten ' , ' sd_uniaccount_ehemalige_studierende ' , ' sd_vpn anleitungen ' , ' sd_kurs-angebote itmc ' , ' sd_studisek ' , ' sd_login tu portale ' , ' sd_immatrikulationsbescheigung_druckfehler ' , ' ub_drucker kopierer ' , ' sd_vpn_temporaerer fehler ub ' , ' sd_spss_online_bestellung ' , ' sd_dreamspark ' , ' sd_unicard_gesperrte unicard entsperre ' , ' sd_boss-bescheinigung ' , ' bd_goeke_allgemein ' , ' sd_uniaccount_passwortaenderung ' , ' sd_namensaenderung_englisch ' , ' sd_email_namensaenderung ' , ' bd_unicard_freigabe_beantragung ' , ' spam ohne tu bezug ' , ' sd_internationaloffice ' , ' sd
    WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs

    # load the pickled resources; declared global so that the module-level dicts defined above
    # are actually replaced (otherwise these assignments would only create locals)
    global THESAURUS, LEMMAS, DE_STOP_WORDS, VORNAMEN

    THESAURUS = load_obj(path2thesaurus_dict)
    #WORDS = load_obj(path2wordsdict)
    LEMMAS = load_obj(path2lemmadict)
    DE_STOP_WORDS = load_obj(path2DEstopwordlist)
    #EN_STOP_WORDS = load_obj(path2ENstopwordlist)
    VORNAMEN = load_obj(path2firstnameslist)

    ents_boss = []
    ents_sap = []

    for doc in corpus:
        result = []

        #if doc.metadata["TicketNumber"] == "INC40506":
        #    breakpoint()
        for tok in doc:

            """
            if tok.lower_ == "boss":
                ents_boss.append(tok.ent_type_)

            if tok.lower_ == "sap":
                ents_sap.append(tok.ent_type_)
            """

            # if in the whitelist, keep the token directly
            if tok.lower_ in WHITELIST:
                result.append(tok.lower_)

            # ignore header, urls, emails, stopwords, first names
            lemmatized_word = lemmatizeWord(tok.text, lemma_dict=LEMMAS)
            if lemmatized_word.lower() in ["sehr", "geehrt", "herr", "herrn", "herren", "dame", "damen", "liebe", "lieben", "hallo", "guten", "tag", "ehre", "hi"] \
                    or tok.like_url \
                    or tok.like_email \
                    or tok.is_stop \
                    or tok.is_punct \
                    or tok.lower_ in DE_STOP_WORDS \
                    or tok.lower_ in VORNAMEN:
                continue

            # cut after footer
            if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]:  # bug: for e.g. INC40506 this cuts away most of the text
                break

            if tok.pos_ in ["NOUN"] \
                    or tok.ent_type_ in ["NORP", "FACILITY", "ORG", "PRODUCT", "WORK_OF_ART", "LOC"]:
                #or tok.dep_ == "ROOT":
                # or tok.lower_ in NOUNS \ #,"PERSON"] \
                toktext = tok.lower_

                toktext = lemmatized_word

                # idea: build the main synonym here for counting, but feed the original words into the llda algorithm
                """
                first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
                if first_synonym is not None or first_synonym != '':
                    toktext = first_synonym if len(first_synonym.split()) == 1 else toktext
                """

                result.append(toktext)

        yield " ".join(result)

    """
    print(list(set(ents_sap)))
    ['', 'ORG', 'PERSON', 'LOC']

    print(list(set(ents_boss)))
    ['', 'ORG', 'PERSON', 'LOC']
    """

def preprocessCorpus(corpus, clean_in_meta):

    logprint("Preprocess {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    preCorpus_name = corpus.lang + "_pre"

    clean_corpus = corpus

    parser = corpus.spacy_lang

    pre_corpus = textacy.Corpus(parser)

    ## process and add the texts to the textacy corpus
    pre_corpus.add_texts(
        #processContentstream(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
        extract_from_corpus(clean_corpus),

        processDictstream(corpus2Meta(clean_corpus), clean_in_meta, parser=parser)
    )

    # idea: labeled_lines.txt contains bigrams joined with underscores
    # todo drop the preCorpus; llda gets labeled_lines.txt and lda gets the doc-term matrix

    # kick empty docs out of the corpus
    pre_corpus.remove(lambda doc: len(doc) == 0)

    # save corpus
    save_corpus(corpus=pre_corpus, corpus_path=corpus_de_path, corpus_name=preCorpus_name)

    # save corpus as labeled plain text
    savelabledCorpiLines(pre_corpus, de_plainpath)

    return pre_corpus


def main(corpus):
    start = time.time()

    """
    filter_tokens = [
        keepNouns(NOUNS),

        removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),

        removePOS(["PUNCT", "SPACE", "NUM"]),

        #todo TUNING remove_words_containing_Numbers(),
        #todo TUNING remove_long_words(),
        #todo TUNING remove_short_words()
    ]
    """

    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],
        "Subject": [removePOS(["SPACE", "PUNCT"])],
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }

    pre_corpus = preprocessCorpus(corpus, clean_in_meta)

    #for i in range(5):
    #    printRandomDoc(pre_corpus)

    end = time.time()
    logprint("Time Elapsed Preprocessing: {0} min".format((end - start) / 60))

    return pre_corpus


if __name__ == "__main__":
    corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/", corpus_name="de_clean")
    main(corpus)

"""
pipe = [

    ##String

    fixUnicode(),
    replaceHardS(),
    resolveAbbrivations(),

    remove_words_containing_topLVL(),

    replaceSpecialChars(" "),  # (replace with spaces; this way terms like "8203;verfügung" get split apart)

    remove_words_containing_Numbers(),

    ##spacyParse

    removeENT("PERSON"),
    keepPOS(["NOUN"]),

    #OR

    lemmatize(),
    removeWords(de_stop_words + config.get("preprocessing", "custom_words").split(",")),

    # possibly
    spellCorrection(),
    keepUniqeTokens(),

]
"""

"""
filter_tokens = [
    #removeENT(["PERSON"]),
    #idea: remove addresses  #so far handled via cut_after("gruss") --> postal.parser
    #idea: spelling correction --> PyEnchant
    #idea: thesaurus --> WordNet, or a custom one

    remove_words_containing_Numbers(),

    removePOS(["PUNCT", "SPACE", "NUM"]),

    removeWords(de_stop_words + custom_words),

    remove_long_words(),
    remove_short_words(),
    remove_first_names(),

    keepPOS(["NOUN"]),

]
"""