# -*- coding: utf-8 -*-

import re
import time
import json

import spacy
import textacy

from functools import reduce

start = time.time()

import enchant
from datetime import datetime
import os
import xml.etree.ElementTree as ET

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

from miscellaneous import *

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"

parser = spacy.load("de")

"""
# load config
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

PARSER = spacy.load("de")

corpi = textacy.Corpus(PARSER)
testcontetn = [
    "fdsfdsfsd",
    "juzdtjlkö",
    "gfadojplk"
]

testmetda = [
    {"categoryName": "zhb", "Solution": "", "Subject": "schulungstest"},
    {"categoryName": "neuanschluss", "Solution": "subject", "Subject": "telephone contract"},
    {"categoryName": "zhb", "Solution": "", "Subject": "setuji"}
]


def makecontent(testcontetn):
    for content in testcontetn:
        yield content


def makemeta(testmetda):
    for metdata in testmetda:
        yield metdata


def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text


corpi.add_texts(
    makecontent(testcontetn),
    makemeta(testmetda)
)

corpus_de_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test/"
rawCorpus_name = "de_test_ticket"

print(corpi)

#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
dict = { " unicard redaktionsteam " : 189 , " kms " : 131 , " itmc_st \u00f6 rungen " : 17 , " benutzerverwaltung_probleme " : 168 , " mailverteiler exchange " : 130 , " beamer " : 70 , " cws_confluence " : 190 , " benutzerverwaltung " : 26 , " sos " : 166 , " virtuelle server " : 116 , " sap " : 7 , " wlan " : 21 , " lsf " : 6 , " gastaufenthalt " : 8 , " umzug " : 5 , " firewall betreuung " : 129 , " ausleihe " : 39 , " fiona " : 10 , " kursplanung " : 195 , " schulungsraum verwaltung " : 200 , " plagiatserkennung " : 32 , " designentwicklung " : 100 , " ub basis it " : 184 , " tsm " : 51 , " backup tsm " : 110 , " raumkalender " : 174 , " veeam " : 149 , " linux bs " : 42 , " hochleistungsrechnen " : 90 , " e learning " : 37 , " h \u00f6 rsaal \u00fc bertragung " : 52 , " sophos " : 88 , " service portal redaktion " : 182 , " verkauf " : 93 , " fk 16 " : 30 , " campus app " : 54 , " dns " : 71 , " kurse " : 196 , " itmc schulungsr \u00e4 ume " : 96 , " leitung " : 91 , " telefon " : 14 , " housing " : 135 , " softwarelizenzen " : 35 , " hcm stammdaten " : 68 , " semesterticket " : 197 , " exchange nutzung " : 33 , " mediendienste " : 167 , " sam spider " : 172 , " pvp " : 27 , " webserver " : 29 , " werkvertr \u00e4 ge " : 158 , " ibz raumbuchung " : 177 , " webmailer " : 126 , " unicard sperrung " : 64 , " cd dvd produktion " : 114 , " lizenzserver " : 92 , " pr \u00fc fungsmanagement " : 38 , " blogs wikis foren " : 87 , " unicard ausgabe " : 161 , " pools " : 157 , " desktop & basisdienste " : 144 , " antrag auf rechnungserstellung " : 193 , " mailalias " : 121 , " evaexam " : 133 , " neuanschluss " : 0 , " mobilfunkvertr \u00e4 ge " : 69 , " ftp server " : 191 , " haustechnik " : 77 , " raumbuchungssysteme " : 186 , " confluence " : 181 , " uniaccount zugangsdaten " : 47 , " itmc medienr \u00e4 ume ef50 " : 171 , " dokoll support " : 128 , " elektronisches telefonbuch " : 3 , " softwareverteilung " : 153 , " overhead projektor " : 104 , " sicherheit " : 145 , " itmc_als " : 48 , " itmc pools " : 160 , " zhb " : 60 , " serversupport " : 101 , " veranstaltungen " : 61 , " fk12 webauftritt " : 138 , " hardware " : 142 , " unicard produktion " : 156 , " telefonkonferenzen " : 170 , " dhcp " : 188 , " zertifikate server dfn " : 139 , " lan " : 1 , " datanet " : 49 , " neuausstattung " : 173 , " moodle " : 16 , " abmeldung " : 13 , " uni mail " : 15 , " medienr \u00e4 ume ef50 " : 117 , " verschiedene aufgaben " : 40 , " zentrale webserver " : 75 , " vorlesungsaufzeichnung " : 152 , " grafik " : 132 , " campus management " : 72 , " hacker angriff " : 46 , " pos " : 23 , " zugangsdaten " : 41 , " serviceportal " : 63 , " ews " : 24 , " voicemail box " : 150 , " service desk itmc " : 74 , " test " : 180 , " beschaffung " : 57 , " bestellung " : 185 , " vpn " : 55 , " app feedback " : 66 , " allgemein " : 134 , " rundmail " : 105 , " telefonabrechnung " : 199 , " limesurvey " : 31 , " unicard " : 28 , " eldorado " : 140 , " uniaccount " : 12 , " plotter " : 125 , " mdm mobile device management " : 120 , " namens \u00e4 nderung " : 43 , " sd " : 84 , " basis applikationen " : 103 , " \u00e4 nderung " : 194 , " fileserver einrichtung " : 187 , " fk14_test " : 154 , " werkst \u00e4 tte " : 179 , " itmc_aufgaben " : 45 , " formulare antr \u00e4 ge " : 81 , " facility " : 192 , " web " : 169 , " asknet " : 136 , " server storage " : 113 , " mail groupware " : 20 , " rektorat -b \u00fc ro " : 178 , " office " : 50 , " werkstoffe lehrstuhl bauwesen " : 59 , " telefonzentrale " : 115 , " verwaltung 
" : 4 , " netze " : 22 , " beantragung " : 82 , " d.3 dms " : 148 , " redmine projektverwaltung " : 141 , " wsus " : 106 , " lido " : 118 , " rechnerr \u00e4 ume " : 143 , " matrix42_hilfe " : 18 , " boss service desk " : 44 , " konteneinsicht " : 62 , " spam phishing " : 53 , " forensic " : 164 , " fk 12 " : 11 , " benutzungsverwaltung " : 198 , " redmine " : 79 , " basis app " : 85 , " viren " : 95 , " fk12 migration " : 155 , " raumbuchung " : 109 , " virtuelle desktops citrix " : 176 , " outlook_einrichtung " : 123 , " kundenserver " : 137 , " nrw ticket " : 80 , " weiterentwicklung " : 127 , " siport zugangskontrolle " : 98 , " e mail dienste " : 99 , " vorlagenerstellung " : 36 , " video " : 19 , " studierendensekretariat " : 111 , " it sicherheit sic " : 86 , " boss " : 25 , " technik " : 58 , " dokoll pvp " : 112 , " betrieb " : 2 , " v2 campus app feedback " : 151 , " mailverteiler " : 108 , " videoschnitt " : 119 , " fk raumplanung 09 " : 9 , " sap urlaub " : 73 , " keine r \u00fc ckantwort " : 124 , " prozess- und projektmanagement " : 67 , " dienstreise " : 34 , " webgestaltung " : 78 , " schulung " : 175 , " software " : 89 , " medientechnik " : 76 , " servicedesk " : 107 , " service portal " : 94 , " software entwicklung " : 165 , " uniflow " : 159 , " ub_st \u00f6 rungen " : 162 , " fk15 " : 183 , " uhren " :
list = [(key, value) for key, value in dict.items()]
list.sort(key=lambda tup: tup[1])
"""
"""
from spacy . tokens . doc import Doc as SpacyDoc
filepath = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin "
# load parser
parser = spacy . load ( " de " )
corpus_path = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/ "
stringstorepath = corpus_path + ' de_parser/vocab/strings.json '
with open ( stringstorepath ) as file :
parser . vocab . strings . load ( file )
vocabpath = Path ( corpus_path + ' de_parser/vocab/lexemes.bin ' )
parser . vocab . load_lexemes ( vocabpath )
spacy_vocab = parser . vocab
def readCorpus ( filepath ) :
with open_sesame ( filepath , mode = ' rb ' ) as f :
for bytes_string in SpacyDoc . read_bytes ( f ) :
yield SpacyDoc ( spacy_vocab ) . from_bytes ( bytes_string ) . text
textacy . fileio . write_file_lines ( readCorpus ( filepath ) , " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt " )
2017-10-10 14:42:09 +02:00
"""

# load raw corpus and create new one
#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
#printRandomDoc(raw_corpus)
"""
spacy_doc = PARSER ( " test " )
save_obj ( spacy_doc , " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl " )
spacy_doc2 = load_obj ( " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl " )
print ( " Doc: {0} " . format ( spacy_doc2 ) )
2017-10-18 17:37:20 +02:00
jgibbsLLDA_root = " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ "
2017-10-12 15:57:56 +02:00
2017-10-18 17:37:20 +02:00
LLDA_filepath = " {0} labeldict.txt " . format ( jgibbsLLDA_root )
laveldict = { ' fiona ' : 10 , ' vorlagenerstellung ' : 36 , ' webserver ' : 29 , ' matrix42_hilfe ' : 18 , ' sap ' : 7 , ' pos ' : 23 , ' verwaltung ' : 4 , ' lan ' : 1 }
with open ( LLDA_filepath , ' w ' ) as file :
file . write ( json . dumps ( laveldict ) )
2017-10-17 10:13:49 +02:00
"""

"""
def load_corpus(corpus_path, corpus_name, lang="de"):
    from pathlib import Path

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
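
# Usage sketch (purely illustrative; corpus_de_path, rawCorpus_name and printRandomDoc are names
# that appear elsewhere in this file or in miscellaneous):
#   corpus = load_corpus(corpus_path=corpus_de_path, corpus_name=rawCorpus_name, lang="de")
#   printRandomDoc(corpus)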
"""

"""
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"


def build_thesaurus(path2lexicalentries):  #, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    #synroot = syntree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # remove all dots
                        string = re.sub(r'[.]', "", string)

                        # remove everything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize longer whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1", "w2"]}

    for word, synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)

    # sort by the number of words in the strings
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    thesaurus_template = {"w1": "mainsyn"}

    for word, synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except:
            pass
    return thesaurus
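
# Sketch of the mapping build_thesaurus() yields, with made-up entries (not taken from the real
# lexicalentries.xml): if "telefon" and "fernsprecher" both list synset "s_100" and "telefon" is
# read first, synset2Words["s_100"] becomes ["telefon", "fernsprecher"], so both written forms
# end up mapped to the "main synonym":
#   build_thesaurus(lexicalentries)  # -> {..., "telefon": "telefon", "fernsprecher": "telefon", ...}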


for r in synroot:
    for element in r:

        if element.tag == "Synset":
            synset = []
            attrib = element.attrib
            id = attrib["id"]

            if id not in synset2Words.keys():
                synset2Words[id] = "WORD"
"""

"""
from postal.parser import parse_address

address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
print(parse_address(address))

address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund"
print(parse_address(address))
"""

"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "testcorpus"


#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)

import pathlib

strings_path = pathlib.Path(corpus_path + 'strings.json')
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')

PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)


def save_corpus(corpus_path, corpus_name):
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        PARSER.vocab.strings.dump(file)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)


def load_corpus(corpus_path, corpus_name):
    # load new lang
    nlp = spacy.load("de")

    # load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "r") as file:
        nlp.vocab.strings.load(file)

    # define corpi
    corpi = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name + "_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    # load content
    contentpath = corpus_path + corpus_name + "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpi.add_doc(
            textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))

    return corpi


save_corpus(corpus_path, corpus_name)
print(load_corpus(corpus_path, corpus_name))
"""

"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    #return lambda doc: parser(" ".join([tok.lower_ for tok in doc]))
    return lambda doc: parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))


def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
            else:  # if it is a phrase
                if word in syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
    return str(word)  # fall back to the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses  #todo does this ever happen?? strip parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is contained, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w
    return word  # fall back to the original word
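
# Quick sketch with a made-up thesaurus in this block's list-of-synonym-blocks format:
#   THESAURUS = [["telefon (hauptform)", "fernsprecher", "apparat"]]
#   getFirstSynonym("fernsprecher", THESAURUS)  # -> "telefon" (the entry marked "hauptform")
#   getFirstSynonym("unbekannt", THESAURUS)     # -> "unbekannt" (unknown words pass through unchanged)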
"""

"""
path2xml = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"

tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()

for r in root:
    for element in r:

        if element.tag == "Synset":
            attrib = element.attrib

            for i, subentry in enumerate(element):
                if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
                    string = (subentry.attrib["writtenForm"])

                    # replaceRockDots
                    string = re.sub(r'[ß]', "ss", string)
                    string = re.sub(r'[ö]', "oe", string)
                    string = re.sub(r'[ü]', "ue", string)
                    string = re.sub(r'[ä]', "ae", string)

                    # seperate_words_on_regex:
                    string = " ".join(re.compile(regex_specialChars).split(string))

                    string_list = string.split()
                    if len(string_list) == 1:
                        nomen.append(string.lower().strip())
"""

"""
import re
from collections import Counter


def words(text): return re.findall(r'\w+', text.lower())


WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))


def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
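
# Usage sketch (hypothetical misspelling; the actual result depends on the loaded frequency file):
#   correction("telefn")  # e.g. -> "telefon", the most frequent known candidate produced by edits1/edits2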
"""

"""
### extract from derewo

#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html

raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")

nomen = []  # collected nouns

for line in raw:
    line_list = line.split()
    if line_list[2] == "NN":
        string = line_list[1].lower()

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)

        nomen.append(string.lower().strip())

textacy.fileio.write_file_lines(nomen, "nomen2.txt")
"""
"""
2017-09-21 12:05:32 +02:00
stream = textacy . fileio . read_csv ( " /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv " , delimiter = " ; " )
content_collumn_name = " Description "
content_collumn = 9 # standardvalue
de_tickets = [ ]
en_tickets = [ ]
misc_tickets = [ ]
error_count = 0
for i , lst in enumerate ( stream ) :
if i == 0 :
de_tickets . append ( lst )
en_tickets . append ( lst )
misc_tickets . append ( lst )
else :
try :
content_collumn_ = lst [ content_collumn ]
if detect ( content_collumn_ ) == " de " :
de_tickets . append ( lst )
elif detect ( content_collumn_ ) == " en " :
en_tickets . append ( lst )
else :
misc_tickets . append ( lst )
except :
misc_tickets . append ( lst )
error_count + = 1
print ( error_count )
textacy . fileio . write_csv ( de_tickets , " M42-Export/de_tickets.csv " , delimiter = " ; " )
textacy . fileio . write_csv ( en_tickets , " M42-Export/en_tickets.csv " , delimiter = " ; " )
textacy . fileio . write_csv ( misc_tickets , " M42-Export/misc_tickets.csv " , delimiter = " ; " )
2017-09-19 14:42:38 +02:00
2017-09-25 13:12:23 +02:00
"""

"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,",
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words), [seperate_words_on_regex()]):
    print(s.strip())
    #print(stringcleaning(w,string_comp))
    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    #result = specialFinder.sub(" ", w)
    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""
"""
2017-09-21 12:05:32 +02:00
def replaceRockDots ( ) :
return lambda string : re . sub ( r ' [ß] ' , " ss " , ( re . sub ( r ' [ö] ' , " oe " , ( re . sub ( r ' [ü] ' , " ue " , ( re . sub ( r ' [ä] ' , " ae " , string . lower ( ) ) ) ) ) ) ) )
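
# Quick sketch (hypothetical input):
#   replaceRockDots()("Störungsübersicht")  # -> "stoerungsuebersicht"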


de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
#print(blob.entities)

de_stop_words = list(map(replaceRockDots(), de_stop_words))
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace, de_stop_words))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))


#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words, "german_stopwords.txt")
"""


end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))