# -*- coding: utf-8 -*-

import re
import time
import json

# import spacy
# import textacy
from functools import reduce

start = time.time()

import enchant
from datetime import datetime
import xml.etree.ElementTree as ET

"""
PARSER = spacy.load("de")

corpi = textacy.Corpus(PARSER)

testcontetn = [
    "fdsfdsfsd",
    "juzdtjlkö",
    "gfadojplk"
]

testmetda = [
    {"categoryName": "zhb", "Solution": "", "Subject": "schulungstest"},
    {"categoryName": "neuanschluss", "Solution": "subject", "Subject": "telephone contract"},
    {"categoryName": "zhb", "Solution": "", "Subject": "setuji"}
]


def makecontent(testcontetn):
    for content in testcontetn:
        yield content


def makemeta(testmetda):
    for metdata in testmetda:
        yield metdata


corpi.add_texts(
    makecontent(testcontetn),
    makemeta(testmetda)
)

print(corpi)
"""
"""
def load_corpus(corpus_path, corpus_name, lang="de"):
    from pathlib import Path

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
"""

import os

a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt"
c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"

scriptpath = os.path.dirname(os.path.realpath(__file__))

"""
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"


def build_thesaurus(path2lexicalentries):  # , path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    # syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    # synroot = syntree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        # replaceRockDots: map umlauts and sharp s to ascii
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # remove all dots
                        string = re.sub(r'[.]', "", string)

                        # remove everything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize longer runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1", "w2"]}

    for word, synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)

    # sort by the number of words in the strings
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    thesaurus_template = {"w1": "mainsyn"}

    for word, synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except:
            pass
    return thesaurus
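
# Usage sketch (assumes the lexicalentries.xml path defined above):
# THESAURUS = build_thesaurus(lexicalentries)
# print(THESAURUS.get("telefon", "telefon"))  # falls back to the word itself if unknown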

# note: `synroot` is only defined in the commented-out synset parsing above and
# `synset2Words` is local to build_thesaurus, so this loop is a dead-code leftover
for r in synroot:
    for element in r:

        if element.tag == "Synset":
            synset = []
            attrib = element.attrib
            id = attrib["id"]

            if id not in synset2Words.keys():
                synset2Words[id] = "WORD"
"""
"""
from postal.parser import parse_address

address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
print(parse_address(address))

address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund"
print(parse_address(address))
"""
"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "testcorpus"

# corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
# corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)

import pathlib

strings_path = pathlib.Path(corpus_path + 'strings.json')
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')

PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)


def save_corpus(corpus_path, corpus_name):
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        PARSER.vocab.strings.dump(file)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)


def load_corpus(corpus_path, corpus_name):
    # load new lang
    nlp = spacy.load("de")

    # load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "r") as file:
        nlp.vocab.strings.load(file)

    # define corpi
    corpi = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name + "_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    # load content
    contentpath = corpus_path + corpus_name + "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpi.add_doc(
            textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))

    return corpi


save_corpus(corpus_path, corpus_name)
print(load_corpus(corpus_path, corpus_name))
"""
"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    # return lambda doc: parser(" ".join([tok.lower_ for tok in doc]))
    return lambda doc: parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))


def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
            else:  # otherwise it is a phrase
                if word in syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
    return str(word)  # as a fallback, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()
        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses  # todo: does that ever happen? strip parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no main form ("Hauptform") is present, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w
    return word  # as a fallback, return the original word
"""
"""
path2xml = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"

tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()

nomen = []  # collected single-word nouns

for r in root:
    for element in r:

        if element.tag == "Synset":
            attrib = element.attrib

            for i, subentry in enumerate(element):
                if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
                    string = (subentry.attrib["writtenForm"])

                    # replaceRockDots: map umlauts and sharp s to ascii
                    string = re.sub(r'[ß]', "ss", string)
                    string = re.sub(r'[ö]', "oe", string)
                    string = re.sub(r'[ü]', "ue", string)
                    string = re.sub(r'[ä]', "ae", string)

                    # seperate_words_on_regex:
                    string = " ".join(re.compile(regex_specialChars).split(string))

                    string_list = string.split()
                    if len(string_list) == 1:
                        nomen.append(string.lower().strip())
"""
"""
import re
from collections import Counter


def words(text): return re.findall(r'\w+', text.lower())


WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))


def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
"""
"""
### extract from derewo

# http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html

raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")

nomen = []  # collected noun entries

for line in raw:
    line_list = line.split()
    if line_list[2] == "NN":
        string = line_list[1].lower()

        # replaceRockDots: map umlauts and sharp s to ascii
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)

        nomen.append(string.lower().strip())

textacy.fileio.write_file_lines(nomen, "nomen2.txt")
"""
"""
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")

content_collumn_name = "Description"
content_collumn = 9  # default column index

de_tickets = []
en_tickets = []
misc_tickets = []

error_count = 0
for i, lst in enumerate(stream):
    if i == 0:
        # keep the first row (header) in every output file
        de_tickets.append(lst)
        en_tickets.append(lst)
        misc_tickets.append(lst)
    else:
        try:
            content_collumn_ = lst[content_collumn]
            if detect(content_collumn_) == "de":
                de_tickets.append(lst)
            elif detect(content_collumn_) == "en":
                en_tickets.append(lst)
            else:
                misc_tickets.append(lst)
        except:
            misc_tickets.append(lst)
            error_count += 1

print(error_count)

textacy.fileio.write_csv(de_tickets, "M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets, "M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets, "M42-Export/misc_tickets.csv", delimiter=";")
"""
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,",
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words), [seperate_words_on_regex()]):
    print(s.strip())

    # print(stringcleaning(w,string_comp))
    # print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    # print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    # result = specialFinder.sub(" ", w)
    # print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
    # print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""
"""
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", re.sub(r'[ö]', "oe", re.sub(r'[ü]', "ue", re.sub(r'[ä]', "ae", string.lower()))))


de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

# blob = Text(str(textacy.fileio.read_file("teststring.txt")))  # ,parser=PatternParser(pprint=True, lemmata=True))
# print(blob.entities)

de_stop_words = list(map(replaceRockDots(), de_stop_words))
# LEMMAS = list(map(replaceRockDots(), LEMMAS))
# VORNAMEN = list(map(replaceRockDots(), VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace, de_stop_words))
# LEMMAS = list(map(textacy.preprocess.normalize_whitespace, LEMMAS))
# VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, VORNAMEN))

# textacy.fileio.write_file_lines(LEMMAS, "lemmas.txt")
# textacy.fileio.write_file_lines(VORNAMEN, "firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words, "german_stopwords.txt")
"""

end = time.time()
print("\n\n\nTime Elapsed Topic: {0}\n\n".format(end - start))