# -*- coding: utf-8 -*-
import re
import time
import json
import spacy
import textacy
start = time.time()
import enchant
from datetime import datetime
import xml.etree.ElementTree as ET

print(datetime.now())
"""
PARSER = spacy.load("de")
corpus = textacy.Corpus(PARSER)

testcontent = [
    "fdsfdsfsd",
    "juzdtjlkö",
    "gfadojplk"
]

testmetadata = [
    {"categoryName": "zhb", "Solution": "", "Subject": "schulungstest"},
    {"categoryName": "neuanschluss", "Solution": "subject", "Subject": "telephone contract"},
    {"categoryName": "zhb", "Solution": "", "Subject": "setuji"}
]


def makecontent(testcontent):
    for content in testcontent:
        yield content


def makemeta(testmetadata):
    for metadata in testmetadata:
        yield metadata


corpus.add_texts(
    makecontent(testcontent),
    makemeta(testmetadata)
)

print(corpus)
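
# Minimal usage sketch: read back the metadata attached via makemeta() above
# (assumes textacy Docs expose it as `doc.metadata`, as load_corpus() further
# below also does).
for doc in corpus:
    print(doc.metadata["categoryName"], doc.metadata["Subject"])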
"""
from postal.parser import parse_address

address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
print(parse_address(address))

address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund"
print(parse_address(address))
"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "testcorpus"

#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)

import pathlib

strings_path = pathlib.Path(corpus_path + 'strings.json')
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')

PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)
def save_corpus(corpus_path, corpus_name):
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        PARSER.vocab.strings.dump(file)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


def load_corpus(corpus_path, corpus_name):
    # load new lang
    nlp = spacy.load("de")

    # load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "r") as file:
        nlp.vocab.strings.load(file)

    # define corpus
    corpus = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name + "_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    # load content
    contentpath = corpus_path + corpus_name + "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus


save_corpus(corpus_path, corpus_name)
print(load_corpus(corpus_path, corpus_name))
"""
"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
    return lambda doc: parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))


def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # iterate through the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
            else:  # if it is a phrase
                if word in syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
    return str(word)  # as a fallback, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()
        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses  # todo: does that ever happen? strip parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w
    return word  # as a fallback, return the original word
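
# Minimal usage sketch with a toy thesaurus; the real THESAURUS is loaded elsewhere.
# This hypothetical example only illustrates the expected shape: a list of synonym
# blocks in which the preferred term is marked with "(Hauptform)".
toy_thesaurus = [
    ["rechner (hauptform)", "computer", "pc"],
    ["telefon (hauptform)", "fernsprecher"]
]
print(getFirstSynonym("pc", toy_thesaurus))  # -> "rechner"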
"""
"""
path2xml = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"

tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()

for r in root:
    for element in r:

        if element.tag == "Synset":
            attrib = element.attrib

            for i, subentry in enumerate(element):
                if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
                    string = (subentry.attrib["writtenForm"])

                    # replaceRockDots
                    string = re.sub(r'[ß]', "ss", string)
                    string = re.sub(r'[ö]', "oe", string)
                    string = re.sub(r'[ü]', "ue", string)
                    string = re.sub(r'[ä]', "ae", string)

                    # seperate_words_on_regex:
                    string = " ".join(re.compile(regex_specialChars).split(string))

                    string_list = string.split()
                    if len(string_list) == 1:
                        nomen.append(string.lower().strip())
"""
"""
import re
from collections import Counter


def words(text): return re.findall(r'\w+', text.lower())


WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))


def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
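
# Minimal usage sketch of this Norvig-style corrector once WORDS has been built;
# the query word is hypothetical and the result depends on which words actually
# occur in deu_news_2015_1M-sentences.txt. Note that `letters` is ASCII-only,
# so candidate edits never introduce umlauts.
print(correction("korektur"))  # likely "korrektur", if present in the corpus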
"""
"""
### extract from derewo
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
for line in raw:
    line_list = line.split()
    if line_list[2] == "NN":
        string = line_list[1].lower()

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)

        nomen.append(string.lower().strip())

textacy.fileio.write_file_lines(nomen, "nomen2.txt")
"""
"""
# detect() is assumed to come from the langdetect package (not imported here)
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")

content_column_name = "Description"
content_column = 9  # default value

de_tickets = []
en_tickets = []
misc_tickets = []

error_count = 0
for i, lst in enumerate(stream):
    if i == 0:
        # keep the header row in every output file
        de_tickets.append(lst)
        en_tickets.append(lst)
        misc_tickets.append(lst)
    else:
        try:
            content = lst[content_column]
            if detect(content) == "de":
                de_tickets.append(lst)
            elif detect(content) == "en":
                en_tickets.append(lst)
            else:
                misc_tickets.append(lst)
        except:
            misc_tickets.append(lst)
            error_count += 1

print(error_count)

textacy.fileio.write_csv(de_tickets, "M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets, "M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets, "M42-Export/misc_tickets.csv", delimiter=";")
"""
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,"  # note: no comma, so this and the next literal concatenate into one item
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words), [seperate_words_on_regex()]):
    print(s.strip())

#print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""
"""
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
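
# Quick sanity check of the umlaut/ß normalization above (hypothetical input word):
print(replaceRockDots()("Fußgängerübergang"))  # -> "fussgaengeruebergang"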

de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
#print(blob.entities)

de_stop_words = list(map(replaceRockDots(), de_stop_words))
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace, de_stop_words))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))

#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words, "german_stopwords.txt")
"""
end = time.time()
print("\n\n\nTime Elapsed Topic: {0}\n\n".format(end - start))