# -*- coding: utf-8 -*-
import csv
import random
import re
import spacy
import textacy
import sys
import xml.etree.ElementTree as ET
"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""
csv.field_size_limit(sys.maxsize)
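# assumption: the limit is raised so that very long rows in the thesaurus CSV
# read below do not trigger a csv.Error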


def printRandomDoc(textacyCorpus):
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


"""
def getFirstSynonym(word, thesaurus_gen):
    word = word.lower()
    # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        # iterate over the synonym block
        for syn in syn_block:
            syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn  # turn the synonym into a list (to detect multi-word phrases)

            # if the word is contained in the synonym (i.e. == one word in the list)
            if word in syn:

                # look for the Hauptform (canonical form)
                if "auptform" in syn:
                    # do not return it if it is in parentheses
                    for w in syn:
                        if not re.match(r'\([^)]+\)', w) and w is not None:
                            return w

                # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
                if len(syn) == 1:
                    w = syn[0]
                    if not re.match(r'\([^)]+\)', w) and w is not None:
                        return w

    return word  # as a last resort, return the input
"""


def cleanText(string, custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
    # apply custom preprocessing first
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    custom_stopwords = custom_stopwords if custom_stopwords is not None else []
    custom_words = custom_words if custom_words is not None else []
    custom_symbols = custom_symbols if custom_symbols is not None else []
    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
    stoplist = list(stop_words) + custom_stopwords

    # list of symbols we don't care about either
    symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", ";", ":", "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + custom_symbols
    # get rid of newlines
    string = string.strip().replace("\n", " ").replace("\r", " ")

    # replace twitter mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace email addresses
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)
    # replace HTML entities
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    # parse with spaCy
    spacy_doc = PARSER(string)
    tokens = []

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
    added_POS = ["NOUN", "NUM"]  # ,"VERB","ADJ"]  idea: include NUM in the corpus, but use only nouns for topic modeling http://aclweb.org/anthology/U15-1013
    # collect tokens in a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())
    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove symbols
    tokens = [tok for tok in tokens if tok not in symbols]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    # TODO expand abbreviations (esp. TU -> Technische Universität)

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(tokens)
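
# usage sketch for cleanText (hypothetical input string, not from the data):
#   cleanText("Der Drucker in Raum 42 druckt nicht. Siehe @admin", lemmatize=True)
# would keep roughly the noun/number tokens, lowercased, with the @-mention
# replaced by the MENTION placeholder defined above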


def generateTextfromXML(path2xml, clean=True, textfield='Beschreibung'):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for subject in root.iter(textfield):
        if clean:
            yield cleanText(subject.text)
        else:
            yield subject.text


def generateMetadatafromXML(path2xml, keys=["Loesung", "Kategorie", "Zusammenfassung"]):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root.findall('ticket'):
        # build a fresh dict per ticket, so consumers that keep the yielded
        # dicts do not all end up referencing the same mutated object
        metadata = dict.fromkeys(keys)
        for key in metadata:
            metadata[key] = ticket.find(key).text
        yield metadata


def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleanText(field.text, normalize_synonyms=normalize_Synonyms, lemmatize=False)
                else:
                    text = field.text
            else:
                # TODO clean here as well?
                metadata[field.tag] = field.text
        yield text, metadata
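
# expected XML shape, inferred from the three generators above (the root element
# name is an assumption; only the <ticket> children are actually used):
#   <tickets>
#     <ticket>
#       <Beschreibung>...</Beschreibung>
#       <Loesung>...</Loesung>
#       <Kategorie>...</Kategorie>
#       <Zusammenfassung>...</Zusammenfassung>
#     </ticket>
#   </tickets>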


def getFirstSynonym(word, thesaurus_gen):
    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return getHauptform(syn_block, word)
            else:  # if it is a phrase
                if word in syn:
                    return getHauptform(syn_block, word)
    return word  # as a last resort, return the original word
def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()
        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return the marker itself, i.e. skip anything in parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w) and re.match(r'\A[\w-]+\Z', w):
                return w

    return word  # as a last resort, return the original word
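
# sketch of the expected openthesaurus.csv format (an assumption based on the
# code above: one synonym set per row, ';'-separated, canonical forms marked
# with "(Hauptform)"):
#   Probelauf;Testlauf (Hauptform);Test
# with that row loaded, getFirstSynonym("Probelauf", THESAURUS_list) returns "testlauf"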

####################'####################'####################'####################'####################'##############

import de_core_news_md

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

LANGUAGE = 'de'

####################'####################'####################'####################'####################'##############

PARSER = de_core_news_md.load()  # spacy.load(LANGUAGE)
THESAURUS_list = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))  # !!!!!! the list() is important: without it later passes do not get the same synonyms, because the generator is consumed at runtime

## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")

#textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH), metadatas=generateMetadatafromXML(DATAPATH))
for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=True, clean=True):
    textacyCorpus.add_text(txt, dic)

for doc in textacyCorpus:
    print(doc.text)

#print(textacyCorpus[2].text)
#printRandomDoc(textacyCorpus)
#print(textacyCorpus[len(textacyCorpus)-1].text)

print()
print()

#################### second run: same pipeline without synonym normalization ####################

PARSER = de_core_news_md.load()  # spacy.load(LANGUAGE)

## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=False, clean=True):
    textacyCorpus.add_text(txt, dic)

for doc in textacyCorpus:
    print(doc.text)

print()
print()