# -*- coding: utf-8 -*-
import csv
import random
import re
import spacy
import textacy
import sys
import xml.etree.ElementTree as ET

"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""

csv.field_size_limit(sys.maxsize)

"""
def getFirstSynonym(word, thesaurus_gen):
    word = word.lower()
    # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        # iterate over the synonym block
        for syn in syn_block:
            # turn the synonym into a list of words (to detect multi-word phrases)
            syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn

            # if the word is contained in the synonym (i.e. equals one word of the list)
            if word in syn:

                # look for the main form ("Hauptform")
                if "auptform" in syn:
                    # do not return it if it is in parentheses
                    for w in syn:
                        if not re.match(r'\([^)]+\)', w) and w is not None:
                            return w

                # if no main form is present, return the first synonym that is not a phrase and not in parentheses
                if len(syn) == 1:
                    w = syn[0]
                    if not re.match(r'\([^)]+\)', w) and w is not None:
                        return w

    return word  # as a fallback, return the input
"""
"""
def cleanText ( string , custom_stopwords = None , custom_symbols = None , custom_words = None , customPreprocessing = None , lemmatize = False , normalize_synonyms = False ) :
2017-08-29 15:01:17 +02:00
# use preprocessing
if customPreprocessing is not None :
string = customPreprocessing ( string )
if custom_stopwords is not None :
custom_stopwords = custom_stopwords
else :
custom_stopwords = [ ]
if custom_words is not None :
custom_words = custom_words
else :
custom_words = [ ]
if custom_symbols is not None :
custom_symbols = custom_symbols
else :
custom_symbols = [ ]
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__ ( " spacy. " + PARSER . lang , globals ( ) , locals ( ) , [ ' object ' ] ) . STOP_WORDS
stoplist = list ( stop_words ) + custom_stopwords
# List of symbols we don't care about either
symbols = [ " ----- " , " --- " , " ... " , " “ " , " ” " , " . " , " - " , " < " , " > " , " , " , " ? " , " ! " , " .. " , " n’ t " , " n ' t " , " | " , " || " , " ; " , " : " , " … " , " ’ s" , " ' s " , " . " , " ( " , " ) " , " [ " , " ] " , " # " ] + custom_symbols
# get rid of newlines
string = string . strip ( ) . replace ( " \n " , " " ) . replace ( " \r " , " " )
# replace twitter
mentionFinder = re . compile ( r " @[a-z0-9_] { 1,15} " , re . IGNORECASE )
string = mentionFinder . sub ( " MENTION " , string )
# replace emails
emailFinder = re . compile ( r " \ b[A-Z0-9._ % +-]+@[A-Z0-9.-]+ \ .[A-Z] { 2,} \ b " , re . IGNORECASE )
string = emailFinder . sub ( " EMAIL " , string )
# replace urls
urlFinder = re . compile ( r " ^(?:https?: \ / \ /)?(?:www \ .)?[a-zA-Z0-9./]+$ " , re . IGNORECASE )
string = urlFinder . sub ( " URL " , string )
# replace HTML symbols
string = string . replace ( " & " , " and " ) . replace ( " > " , " > " ) . replace ( " < " , " < " )
# parse with spaCy
spacy_doc = PARSER ( string )
tokens = [ ]
added_entities = [ " WORK_OF_ART " , " ORG " , " PRODUCT " , " LOC " ] #,"PERSON"]
2017-08-31 14:54:01 +02:00
added_POS = [ " NOUN " ] #, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
2017-08-29 15:01:17 +02:00
# append Tokens to a list
for tok in spacy_doc :
if tok . pos_ in added_POS :
if lemmatize :
tokens . append ( tok . lemma_ . lower ( ) . strip ( ) )
else :
tokens . append ( tok . text . lower ( ) . strip ( ) )
# add entities
if tok . ent_type_ in added_entities :
tokens . append ( tok . text . lower ( ) )
# remove stopwords
tokens = [ tok for tok in tokens if tok not in stoplist ]
# remove symbols
tokens = [ tok for tok in tokens if tok not in symbols ]
# remove custom_words
tokens = [ tok for tok in tokens if tok not in custom_words ]
# remove single characters
tokens = [ tok for tok in tokens if len ( tok ) > 1 ]
# remove large strings of whitespace
2017-08-31 14:54:01 +02:00
remove_large_strings_of_whitespace ( " " . join ( tokens ) )
#idee abkürzungen auflösen (v.a. TU -> Technische Universität)
if normalize_synonyms :
tokens = [ str ( getFirstSynonym ( tok , THESAURUS_list ) ) for tok in tokens ]
2017-08-29 15:01:17 +02:00
return " " . join ( tokens )
2017-08-31 14:54:01 +02:00
def remove_large_strings_of_whitespace ( sentence ) :
whitespaceFinder = re . compile ( r ' ( \ r \ n| \ r| \ n) ' , re . IGNORECASE )
sentence = whitespaceFinder . sub ( " " , sentence )
tokenlist = sentence . split ( " " )
while " " in tokenlist :
tokenlist . remove ( " " )
while " " in tokenlist :
tokenlist . remove ( " " )
return " " . join ( tokenlist )
"""
"""
def generateFromXML ( path2xml , textfield = ' Beschreibung ' , clean = False , normalize_Synonyms = False , lemmatize = False ) :
2017-08-29 15:01:17 +02:00
import xml . etree . ElementTree as ET
tree = ET . parse ( path2xml , ET . XMLParser ( encoding = " utf-8 " ) )
root = tree . getroot ( )
2017-08-31 14:54:01 +02:00
for ticket in root :
metadata = { }
text = " ERROR "
for field in ticket :
if field . tag == textfield :
if clean :
text = cleanText_words ( field . text , PARSER , normalize_synonyms = normalize_Synonyms , lemmatize = lemmatize )
else :
text = field . text
else :
#idee hier auch cleanen?
metadata [ field . tag ] = field . text
yield text , metadata
"""

LANGUAGE = 'de'
PARSER = spacy.load(LANGUAGE)


def generateTextfromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    import xml.etree.ElementTree as ET
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    """
    for subject in root.iter(textfield):
        if clean:
            yield cleanText(subject.text)
        else:
            yield subject.text
    """

    for ticket in root:
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleanText_words(field.text, normalize_synonyms=normalize_Synonyms, lemmatize=lemmatize)
                else:
                    text = field.text
        yield text


def generateMetadatafromXML(path2xml, textfield='Beschreibung'):  # ,keys_to_clean=["Loesung","Zusammenfassung"]
    import xml.etree.ElementTree as ET
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    """
    metadata = dict.fromkeys(keys)

    for ticket in root.findall('ticket'):
        for key in metadata:
            metadata[key] = ticket.find(key).text
        yield metadata
    """

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag != textfield:
                if field.tag == "Zusammenfassung":
                    # idea: only strip whitespace from "Loesung", but also strip symbols from "Zusammenfassung"
                    metadata[field.tag] = cleanText_symbols(field.text)
                elif field.tag == "Loesung":
                    metadata[field.tag] = remove_whitespace(field.text)
                else:
                    metadata[field.tag] = field.text

        yield metadata
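
# A hedged sketch of the XML layout the two generators above assume: a root element whose
# children are ticket elements, each carrying one child element per field. Apart from
# 'Beschreibung', 'Zusammenfassung' and 'Loesung' (which the code checks for explicitly),
# the tag names below are made up for illustration:
#
#   <tickets>
#     <ticket>
#       <Beschreibung>free-text problem description</Beschreibung>
#       <Zusammenfassung>short summary</Zusammenfassung>
#       <Loesung>solution text</Loesung>
#       <Kategorie>some category</Kategorie>
#     </ticket>
#     ...
#   </tickets>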


def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
    """
    https://spacy.io/docs/usage/pos-tagging

    cleans text from PUNCT, NUM, whitespace, newlines, and the following list of symbols:

    ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", ";", ":", "…", "’s", "'s", ".", "(", ")", "[", "]", "#"]
    """
    if custom_symbols is not None:
        custom_symbols = custom_symbols
    else:
        custom_symbols = []

    if keep is not None:
        keep = keep
    else:
        keep = []

    # list of symbols we don't care about
    symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", ";", ":", "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + custom_symbols

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    # POS tags to drop; anything listed in `keep` stays in the text
    pos = ["NUM", "SPACE", "PUNCT"]
    for p in keep:
        pos.remove(p)

    # append tokens to a list
    for tok in spacy_doc:
        if tok.pos_ not in pos and tok.text not in symbols:
            tokens.append(tok.text)

    return " ".join(tokens)
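
# Hedged usage sketch for cleanText_symbols; the sample string is made up and the exact output
# depends on the loaded spaCy model, so this is only illustrative:
#
#   raw = "Drucker defekt!!! (Raum 123)\n\nBitte um Hilfe..."
#   print(cleanText_symbols(raw))                 # punctuation, numbers and whitespace runs are dropped
#   print(cleanText_symbols(raw, keep=["NUM"]))   # keep numbers such as the room number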


def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):

    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    if custom_stopwords is not None:
        custom_stopwords = custom_stopwords
    else:
        custom_stopwords = []

    if custom_words is not None:
        custom_words = custom_words
    else:
        custom_words = []

    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
    stoplist = list(stop_words) + custom_stopwords

    # replace twitter mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace emails
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML entities
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
    added_POS = ["NOUN"]  # , "NUM"]  # ,"VERB","ADJ"]  # for topic modeling keep only nouns http://aclweb.org/anthology/U15-1013

    # append tokens to a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    # remove_whitespace(" ".join(tokens))

    # idea: expand abbreviations (especially TU -> Technische Universität)

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(set(tokens))
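
# Hedged usage sketch for cleanText_words; the sample string is made up, and because of the
# set() above the result is an unordered, de-duplicated bag of nouns/entities:
#
#   raw = "Sehr geehrte Damen und Herren, der Drucker im Büro druckt nicht. Mit freundlichen Grüßen"
#   print(cleanText_words(raw, lemmatize=True, custom_words=["grüßen"]))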


def remove_whitespace(sentence):
    whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)
    return sentence


def getFirstSynonym(word, thesaurus_gen):
    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return getHauptform(syn_block, word)
            else:  # if it is a phrase
                if word in syn:
                    return getHauptform(syn_block, word)

    return word  # as a fallback, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no main form ("Hauptform") is present, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w

    return word  # as a fallback, return the original word
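
# Hedged sketch of the thesaurus structure getFirstSynonym/getHauptform expect: each syn_block
# is one row of the OpenThesaurus CSV, i.e. a list of synonymous terms in which the preferred
# term is marked with "(Hauptform)". The example row below is made up:
#
#   block = ["Rechner (Hauptform)", "Computer", "elektronische Datenverarbeitungsanlage"]
#   getFirstSynonym("Computer", [block])   # -> "rechner"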


def printRandomDoc(textacyCorpus):
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


####################'####################'####################'####################'####################'##############

import de_core_news_md

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

normalize_Synonyms = True
clean = True
lemmatize = True

custom_words = ["grüßen", "fragen"]

####################'####################'####################'####################'####################'##############

# PARSER = de_core_news_md.load()

## !!! the list() is important: otherwise the same synonyms are not returned for later tokens, because the generator gets consumed during the run
THESAURUS_list = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
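
# Assumption: DATAPATH_thesaurus points at the OpenThesaurus plain-text export, where each line
# is one synonym set with its terms separated by ";" (header/comment lines starting with "#"
# would need to be stripped beforehand); the script does not verify this.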

## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")
textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromXML(DATAPATH))

# for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
#     textacyCorpus.add_text(txt, dic)

for doc in textacyCorpus:
    print(doc.metadata)
    print(doc.text)

# print(textacyCorpus[2].text)
# printRandomDoc(textacyCorpus)
# print(textacyCorpus[len(textacyCorpus)-1].text)

print()
print()