# topicModelingTickets/preprocessing.py
# -*- coding: utf-8 -*-
import csv
import os
import re
import sys
import time
from datetime import datetime

from miscellaneous import *

import textacy
from scipy import *

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"
ressources_path = FILEPATH + "ressources/"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
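# Roughly: REGEX_SPECIALCHAR matches single special characters such as "(", "]", "&" or "|",
# and REGEX_TOPLVL matches a dot followed by 2-3 lowercase letters (".de", ".com", ".co.uk"-style
# endings). Illustrative checks:
#
#   bool(re.search(REGEX_SPECIALCHAR, "foo|bar"))     # -> True
#   bool(re.search(REGEX_TOPLVL, "www.example.de"))   # -> True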

# lookup resources (thesaurus, word frequencies, lemmata, first names, stop words);
# initialised empty here, extract_from_corpus() loads its own copies from the pickles
THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = {}
VORNAMEN = {}
DE_STOP_WORDS = {}
EN_STOP_WORDS = {}


############# filter tokens

def filterTokens(tokens, funclist):
    # in: token list, list of predicate functions
    # out: token list containing only the tokens that pass every predicate
    for f in funclist:
        tokens = list(filter(f, tokens))

    return tokens

def keepPOS(pos_list):
    return lambda tok: tok.pos_ in pos_list


def keepNouns(noun_list=NOUNS):
    # NOTE: the default is bound to NOUNS at definition time; pass the loaded noun list explicitly
    #return lambda tok: tok.lower_ in noun_list
    return lambda tok: tok.lower_ in noun_list or tok.pos_ == "NOUN"


def removePOS(pos_list):
    return lambda tok: tok.pos_ not in pos_list


def removeWords(words, keep=None):
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                words.remove(k)
            except ValueError:
                pass

    return lambda tok: tok.lower_ not in words


def keepENT(ent_list):
    return lambda tok: tok.ent_type_ in ent_list


def removeENT(ent_list):
    return lambda tok: tok.ent_type_ not in ent_list


def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search(r'\d', tok.lower_))


def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(REGEX_TOPLVL, tok.lower_))


def remove_words_containing_specialCharacters():
    return lambda tok: not bool(re.search(REGEX_SPECIALCHAR, tok.lower_))


def remove_short_words():
    # drop tokens shorter than 2 characters
    return lambda tok: not len(tok.lower_) < 2


def remove_long_words():
    # drop tokens longer than 35 characters
    return lambda tok: not len(tok.lower_) > 35


def remove_first_names():
    return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]
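# Illustrative: filterTokens(tokens, [removePOS(["PUNCT", "SPACE"]), remove_first_names()])
# applies the predicates in order and keeps only the tokens that pass every one of them.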


############# strings

def remove_addresses(string):
    pass  # todo remove_addresses; idea: use postal.parser and add the address to the metadata


def lemmatizeWord(word, lemma_dict=LEMMAS, n=3):
    # look the word up in the lemma dictionary up to n times (resolves chained entries)
    for i in range(n):
        try:
            word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
        except:
            print(word)
    return word


def getFirstSynonym(word, thesaurus=THESAURUS, n=3):
    # map the word to its first thesaurus synonym, again up to n times
    for i in range(n):
        try:
            word = thesaurus[word.lower()] if word.lower() in thesaurus.keys() else word.lower()
        except:
            print(word)
    return word
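
# Illustrative sketch (hypothetical dictionary entries, not from the shipped pickles):
# with LEMMAS = {"druckern": "drucker"} and THESAURUS = {"rechner": "computer"},
#
#   lemmatizeWord("Druckern", lemma_dict=LEMMAS)      # -> "drucker"
#   getFirstSynonym("Rechner", thesaurus=THESAURUS)   # -> "computer"
#
# Unknown words are simply lowercased and returned unchanged.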
"""
if not isinstance(word, str):
return str(word)
2017-12-08 11:06:07 +01:00
2017-08-31 14:54:01 +02:00
word = word.lower()
2017-10-16 14:01:38 +02:00
if word in thesaurus.keys():
return thesaurus[word]
else:
return str(word)
2017-12-08 11:06:07 +01:00
"""


########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download


def words(text): return re.findall(r'\w+', text.lower())


def P(word, N=None):
    "Probability of `word`."
    # computed lazily so the total reflects WORDS once the frequency pickle has been loaded
    if N is None:
        N = sum(WORDS.values())
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


def autocorrectWord(word):
    # fall back to the unmodified word if WORDS is empty or correction fails
    try:
        return correction(word)
    except:
        return word
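
# Illustrative sketch (hypothetical frequency counts, not from the shipped pickle):
# with WORDS = {"drucker": 120, "ticket": 80},
#
#   known(["drucker", "druker"])   # -> {"drucker"}
#   candidates("druker")           # -> {"drucker"}   ("druker" is one edit away)
#   autocorrectWord("druker")      # -> "drucker"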


############# stringcleaning

def processContentstream(textstream, parser, token_filterlist=None):
    # pre-parse
    textstream = preparse(textstream)

    pipe = parser.pipe(textstream)

    for doc in pipe:
        tokens = [tok for tok in doc]

        # in-parse
        if token_filterlist is not None:
            tokens = filterTokens(tokens, token_filterlist)

        # post-parse
        # todo TUNING KNOB: tokens = [postparse(tok) for tok in tokens]  # todo: loses pos, tag etc. information!
        tokens = [tok.lower_ for tok in tokens]

        yield " ".join(tokens)


def preparse(stringstream):
    # cut each ticket text off after the closing phrase ("gruss", "gruesse", ...)
    for string in stringstream:
        # cut_after
        words = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]

        for gr in words:
            if gr in string:
                string = string.rpartition(gr)[0]
                break

        yield string


def postparse(toktext):
    """
    :param toktext: spacy.token
    :return: string
    """
    toktext = toktext.lower_

    # remove_words_containing_topLVL
    toktext = toktext if not re.search(REGEX_TOPLVL, toktext) else ""

    # lemmatize
    toktext = lemmatizeWord(toktext)

    # normalise synonyms
    toktext = getFirstSynonym(toktext)

    # autocorrect
    toktext = autocorrectWord(toktext)

    return toktext
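
# Illustrative usage sketch (the parser is assumed to be the loaded spaCy German model
# returned by load_corpus(); the filter list and input line are made up):
#
#   filters = [removePOS(["PUNCT", "SPACE", "NUM"]), remove_first_names()]
#   for line in processContentstream(["mein drucker druckt nicht ... gruss max"], parser, filters):
#       print(line)   # cleaned, space-joined token string, cut off before "gruss"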


def processDictstream(dictstream, funcdict, parser):
    """
    :param dictstream: dict-gen
    :param funcdict:
            clean_in_meta = {
                "Solution": funclist,
                ...
            }

    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():

            if key in funcdict:

                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = filterTokens(tokens, funclist)

                result[key] = " ".join([tok.lower_ for tok in tokens])

            else:
                result[key] = value

        yield result
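# Example funcdict, mirroring clean_in_meta in main() below:
#
#   clean_in_meta = {
#       "Solution":     [removePOS(["SPACE"])],
#       "Subject":      [removePOS(["SPACE", "PUNCT"])],
#       "categoryName": [removePOS(["SPACE", "PUNCT"])]
#   }
#
# Every other metadata field is passed through unchanged.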


##################################################################################################

# paths to the pickled resources and to the corpora, taken from config.ini
path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")

path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")

path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")

path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")

path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")

path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")

custom_words = get_list_from_config("preprocessing", "custom_words")

corpus_de_path = FILEPATH + config.get("de_corpus", "path")
de_plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt"

corpus_en_path = FILEPATH + config.get("en_corpus", "path")


# whitelist of relevant technical terms plus the known KB categories, keywords and subject lines;
# extract_from_corpus() keeps whitelist tokens unconditionally
WHITELIST = ["boss", "sap", "firefox"]  # todo: auto-generate the list of relevant technical terms
kb_cats = ['eldorado', 'cws_confluence', 'wsus', 'mail groupware', 'd.3 dms', 'serviceportal', 'softwarelizenzen', 'sophos', 'webserver', 'sap', 'ftp server', 'dhcp', 'tonerboerse', 'mailalias', 'arbeitsplatzsupport', 'mediendienste', 'mailverteiler', 'uni mail', 'basis app', 'videoschnitt', 'DEFAULT', 'verwaltung', 'matrix42_hilfe', 'hoersaaluebertragung', 'redmine', 'uniflow', 'keine rueckantwort', 'pools', 'leitung', 'netze', 'konteneinsicht', 'kennwort aenderung', 'datanet', 'neuanschluss', 'semesterticket', 'asknet', 'veranstaltungen', 'housing', 'fk 16', 'fiona', 'betrieb', 'vorlagenerstellung', 'studierendensekretariat', 'pvp', 'mobilfunkvertraege', 'ausleihe', 'web', 'spam phishing', 'sap urlaub', 'evaexam', 'vorlesungsaufzeichnung', 'firewall betreuung', 'ub basis it', 'virtuelle desktops citrix', 'fk15', 'virtuelle server', 'lizenzserver', 'elektronisches telefonbuch', 'joomla itmc website', 'weiterentwicklung', 'serversupport', 'wlan', 'kurse', 'technik', 'raumkalender', 'backup tsm', 'haustechnik', 'voicemail box', 'facility', 'unicard ausgabe', 'mdm mobile device management', 'entwicklung', 'webgestaltung', 'unicard sperrung', 'forensic', 'basis applikationen', 'overhead projektor', 'plagiatserkennung', 'uniaccount zugangsdaten', 'zentrale webserver', 'webmailer', 'fk12 webauftritt', 'plotter', 'campus management', 'ub_stoerungen', 'rundmail', 'telefon', 'raumbuchung', 'fk12 migration', 'dienstreise', 'hardware', 'it sicherheit sic', 'hochleistungsrechnen', 'unicard', 'sos', 'benutzerverwaltung_probleme', 'confluence', 'vpn', 'zhb', 'campus app', 'itmc_aufgaben', 'sicherheit', 'schulungsraum verwaltung', 'unicard produktion', 'schulung', 'video', 'dokoll support', 'sd', 'servicedesk', 'v2 campus app feedback', 'lido', 'app feedback', 'ibz raumbuchung', 'hcm stammdaten', 'itmc_stoerungen', 'boss service desk', 'exchange nutzung', 'office', 'rektorat -buero', 'bestellung', 'moodle', 'fk raumplanung 09', 'aenderung', 'neuausstattung', 'benutzerverwaltung', 'rechnerraeume', 'designentwicklung', 'fk 12', 'werkstoffe lehrstuhl bauwesen', 'server storage', 'beantragung', 'visitenkartenproduktion', 'gastaufenthalt', 'telefonkonferenzen', 'raumbuchungssysteme', 'fk14_test', 'e mail dienste', 'grafik', 'ews', 'itmc schulungsraeume', 'tsm', 'softwareverteilung', 'beamer', 'lizenzmanagement', 'fileserver einrichtung', 'redmine projektverwaltung', 'service desk itmc', 'pruefungsmanagement', 'prozess- und projektmanagement', 'formulare antraege', 'namensaenderung', 'verkauf', 'software', 'itmc medienraeume ef50', 'zugangsdaten', 'medientechnik', 'lan', 'veeam', 'unicard redaktionsteam', 'changes', 'service portal', 'limesurvey', 'dns', 'dokoll pvp', 'uhren', 'nrw ticket', 'itmc_als', 'linux bs', 'werkvertraege', 'blogs wikis foren', 'test', 'abmeldung', 'desktop & basisdienste', 'telefonzentrale', 'siport zugangskontrolle', 'antrag auf rechnungserstellung', 'verschiedene aufgaben', 'kundenserver', 'medienraeume ef50', 'videokonferenzen', 'benutzungsverwaltung', 'mailverteiler exchange', 'lsf', 'telefonabrechnung', 'werkstaette', 'uniaccount', 'outlook_einrichtung', 'itmc webauftritt', 'zertifikate server dfn', 'allgemein', 'umzug', 'service portal redaktion', 'pos', 'beschaffung', 'boss', 'hacker angriff', 'software entwicklung', 'cd dvd produktion', 'sam spider', 'viren', 'kursplanung', 'itmc pools', 'kms', 'e learning']
kb_keys = ['zugriff_onlinedienste_rueckmeldung', 'uniaccount', 'freischaltung', 'asknet', 'eduroam', 'donnerstagsmail namensaenderung', 'asiexception', 'lsf', 'kundenantwort', 'chip', 'unitymedia', 'citavi', 'fehler', 'windows beziehen', 'wlan', 'ipv6', 'freischaltung verzoegert', 'betrag', '"defekte karte"', 'risse', 'laden', 'sap portal anderer modus', 'goeke', 'informationen des itmc zum einsatz', 'transport wurde durchgefuehrt.', 'wi-fi', 'unicard_auszahlung', 'ausleihe', 'unimail', 'uni-account', 'unicard','beantragung', 'nrw-ticket', 'printservice', 'dms', 'ip6', 'transport und beschreibung zum transportauftrag !', 'wlan passwort', 'dokumentenmanagementsystem', 'webmailer', 'vpn', 'repository', 'unicard', 'projekte', 'eingeschrieben', 'unicard abholung oeffnungszeiten', 'd3', 'beantragung', 'app tu-dortmund feedback', 'semester ticket', 'redmine', 'git', 'geldkarte', 'outlook_exchange', 'spam standardmeldung phishing', 'automatische aktualisierung der selbst angelegten kontakte in outlook', '"beschaedigte unicard"', 'elektronische telefonbuch', 'boss', 'wwrite', 'DEFAULT', 'anyconnect', 'wifi']
kb_subjs = ['sd_office 365 plus support', 'citavi_lizenzschluessel_nicht bekommen', 'uni card', 'sd_office 356 plus bestellung', 'sd_gastaufenthalter', 'sd_outlook kontakte automatische aktualisierung', 'benutzer zum redmine hinzufuegen', 'sd_matlab lizenzdatei pc-pools', 'sd_tu-app feedback standard', 'vpn_ipsec_stoerung', 'vpn verbindung fuer unitymedia kunden', 'ub_prod_abholung_ abholfristen_benachrichtigungen', 'einrichtung des eduroam netzwerks', 'sd_webmailer_threadanzeige und weiterleitung', 'sd_wlan passwort setzen', 'ub_prod_namenskorrektur_student', 'sd_unimail imap_pop3', 'sd_outlook_in_exchange_einbinden', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_heirat_namensaenderung_student', 'bd_unicard_nicht_eingeschrieben', 'wlan', 'sd_telefonbuch_prof_eintragung', 'change produktiv nehmen chn00146 - transport e01k909284', 'ungueltiges ticket siehe journal', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'd.3 client installation', 'unicard_restbetrag_auszahlung', 'cm_asiexception', 'sd_origin_workaround', 'sd_vpn_aktualisierung', 'problem mit der beantragung von der unicard', 'sd_unicard fehlerhafte geldbuchung', 'sd_login tu portals english', 'sd_gmx_web.de', 'studierendenausweis', 'sd_citavi', 'sd_fk9 test', 'sd_webmailer_thread-anzeige', 'bd_unicard_geldkarte_laden', 'ub_unicard_unicard mit vollmacht abholen', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_asknet_mitarbeiter_softwarebestellung', 'how to setup eduroam', 'sd_citavi bestellung', 'unicard vergessen abzuholen und nicht mehr da', 'sd_unimail zu exchange', 'sd_diensthandy beschaffung', 'sd_sap konteneinsicht antrag', 'sd_unicard_defekt', 'sd_webmailer einrichtung weiterleitung', 'sd_kurs-angebote anmeldung', 'm42_dokumentationen_zu_neuen_ous', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_telefonbuch-eintrag_aenderung', 'sd_pruefungsamt', 'sd_phishing', 'apps_dms-passwort d.3', 'sd_goeke drucker', 'sd_sap_dienstreise', 'unicard nochmal beantragen', 'sd_outlook anmeldung gestoert', 'sd_citavi_support', 'DEFAULT', 'sd_geraeteausleihe', 'sd_account_abmelden', 'sd_uniaccount freischaltung verzoegert englisch', 'ub_beschaedigte unicard', 'sd_gleitzeitanlage_dez3_stoerung', 'transportdurchfuehung', 'sd_sap_initialkennwort_englisch', 'sd_antwort_phishingmail', 'sd_namensaenderung mitarbeiter', 're: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss', 'lsf freischaltung als mitarbeiter/in', 'ub_unicard_spaetere abholung moeglich?', 'sd_antrag funktionale mailadresse', 'sd_apple-on-campus', 'sd_office365_asknet', 'sd_sophos download', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'ohne betreff', 'sd_immatrikulationsbescheinigung_portal', 'sd_studisek_buchung_semesterbeitrag', 'sd_studisek_englisch', 'probleme mit der namensaenderung/ neue unicard', 'sd_telefonbuch, neues system', 'fehlender eintrag im elektronischen telefonbuch', 'sd_boss_notenverbuchung', 'sd_laufzeit unimail account', 'sd_semesterticket', 'sd_kontakt_asknet', 'windows 10', 'sd_login_tu_portale', 'ub_geldchip-problem bei uc', 'sd_zugriff_onlinedienste_rueckmeldung', 'sd_wlan-gastkonto', 'sd_tu_app_keine internetverbindung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_verlust/antrag unicard', 'sd_sap_konteneinsicht_ workaround', 'apps_redmine_repository', 'sd_itmc kurse anmeldebestaetigung', 'sd_mail_als_anhang', 'bd_unicard_chip_defekt', 'probleme mit unicard', 'ub_unicard_abholungszeiten', 'sd_falsche_personendaten', 'sd_uniaccount_ehemalige_studierende', 'sd_vpn anleitungen', 'sd_kurs-angebote itmc', 'sd_studisek', 'sd_login tu portale', 'sd_immatrikulationsbescheigung_druckfehler', 'ub_drucker kopierer', 'sd_vpn_temporaerer fehler ub', 'sd_spss_online_bestellung', 'sd_dreamspark', 'sd_unicard_gesperrte unicard entsperre', 'sd_boss-bescheinigung', 'bd_goeke_allgemein', 'sd_uniaccount_passwortaenderung', 'sd_namensaenderung_englisch', 'sd_email_namensaenderung', 'bd_unicard_freigabe_beantragung', 'spam ohne tu bezug', 'sd_internationaloffice']  # list truncated in the source
WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs


def extract_from_corpus(corpus):
    """
    Yields one whitespace-joined string of kept tokens per document: whitelist hits plus
    lemmatized nouns and tokens with a relevant entity type; salutations, urls, emails,
    stop words and first names are skipped, and the text is cut off at the closing footer.
    """
    THESAURUS = load_obj(path2thesaurus_dict)
    #WORDS = load_obj(path2wordsdict)
    LEMMAS = load_obj(path2lemmadict)
    DE_STOP_WORDS = load_obj(path2DEstopwordlist)
    #EN_STOP_WORDS = load_obj(path2ENstopwordlist)
    VORNAMEN = load_obj(path2firstnameslist)

    for doc in corpus:
        result = []

        #if doc.metadata["TicketNumber"] == "INC40506":
        #    breakpoint()

        for tok in doc:

            # debug output for boss/SAP entity types
            if tok.lower_ == "boss" or tok.lower_ == "sap":
                print(tok.lower_ + ": " + tok.ent_type_)

            if tok.lower_ in WHITELIST:
                result.append(tok.lower_)

            # ignore salutations/headers, urls, emails, stop words, first names
            lemmatized_word = lemmatizeWord(tok.text, lemma_dict=LEMMAS)
            if lemmatized_word.lower() in ["sehr", "geehrt", "herr", "herrn", "herren", "dame", "damen", "liebe", "lieben", "hallo", "guten", "tag", "ehre", "hi"] \
                    or tok.like_url \
                    or tok.like_email \
                    or tok.is_stop \
                    or tok.is_punct \
                    or tok.lower_ in DE_STOP_WORDS \
                    or tok.lower_ in VORNAMEN:
                continue

            # cut off after the footer
            if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]:  # bug: for INC40506 this cuts away most of the text
                break

            # boss/SAP ent_type is 'ORG' or '' (occasionally LOC or PERSON)
            if tok.pos_ in ["NOUN"] \
                    or tok.ent_type_ in ["NORP", "FACILITY", "ORG", "PRODUCT", "WORK_OF_ART"]:
                #or tok.dep_ == "ROOT":
                # or tok.lower_ in NOUNS \  #,"PERSON"] \

                toktext = tok.lower_

                toktext = lemmatized_word

                """
                first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
                if first_synonym is not None:
                    toktext = first_synonym if len(first_synonym.split()) == 1 else toktext
                """

                result.append(toktext)

        yield " ".join(result)


def preprocessCorpus(corpus, clean_in_meta):
    logprint("Preprocess {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    preCorpus_name = corpus.lang + "_pre"

    clean_corpus = corpus
    parser = corpus.spacy_lang

    pre_corpus = textacy.Corpus(parser)

    ## process texts and metadata and add them to the textacy corpus
    pre_corpus.add_texts(
        #processContentstream(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
        extract_from_corpus(clean_corpus),
        processDictstream(corpus2Meta(clean_corpus), clean_in_meta, parser=parser)
    )

    # kick empty docs out of the corpus
    pre_corpus.remove(lambda doc: len(doc) == 0)

    # save corpus
    save_corpus(corpus=pre_corpus, corpus_path=corpus_de_path, corpus_name=preCorpus_name)

    # save corpus as labeled plain text
    savelabledCorpiLines(pre_corpus, de_plainpath)

    return pre_corpus


def main(corpus):
    start = time.time()

    """
    filter_tokens = [
        keepNouns(NOUNS),

        removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),

        removePOS(["PUNCT", "SPACE", "NUM"]),

        #todo TUNING KNOB remove_words_containing_Numbers(),
        #todo TUNING KNOB remove_long_words(),
        #todo TUNING KNOB remove_short_words()
    ]
    """

    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],
        "Subject": [removePOS(["SPACE", "PUNCT"])],
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }

    pre_corpus = preprocessCorpus(corpus, clean_in_meta)

    #for i in range(5):
    #    printRandomDoc(pre_corpus)

    end = time.time()
    logprint("Time Elapsed Preprocessing: {0} min".format((end - start) / 60))

    return pre_corpus


if __name__ == "__main__":
    corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
                                 corpus_name="de_clean")
    main(corpus)


"""
pipe=[

    ##String
    fixUnicode(),
    replaceHardS(),
    resolveAbbrivations(),

    remove_words_containing_topLVL(),

    replaceSpecialChars(" "),   (replace with a space; this splits terms such as 8203;verfügung)

    remove_words_containing_Numbers(),


    ##spacyParse
    removeENT("PERSON"),
    keepPOS(["NOUN"]),

    #OR

    lemmatize(),
    removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),

    # possibly
    spellCorrection(),
    keepUniqeTokens(),

]
"""

"""
filter_tokens=[
    #removeENT(["PERSON"]),
    #idea: remove addresses  #so far via cut_after("gruss") --> postal.parser
    #idea: spelling correction --> PyEnchant
    #idea: thesaurus --> WordNet, or a custom one

    remove_words_containing_Numbers(),

    removePOS(["PUNCT","SPACE","NUM"]),

    removeWords(de_stop_words+custom_words),

    remove_long_words(),
    remove_short_words(),
    remove_first_names(),

    keepPOS(["NOUN"]),

]
"""