# -*- coding: utf-8 -*-
import csv
import random
import re
import spacy
import textacy
import sys
import xml.etree.ElementTree as ET
"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""
csv.field_size_limit(sys.maxsize)
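# assumption: the limit is raised so that very long rows in the thesaurus CSV
# read below do not trigger a csv.Error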


def printRandomDoc(textacyCorpus):
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


"""
def getFirstSynonym(word, thesaurus_gen):
    word = word.lower()
    # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        # iterate over the synonym block
        for syn in syn_block:
            syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn  # turn the synonym into a list (to detect multi-word phrases)

            # if the word is contained in the synonym (i.e. == one word in the list)
            if word in syn:

                # look for the Hauptform (canonical form)
                if "auptform" in syn:
                    # do not return it if it is in parentheses
                    for w in syn:
                        if not re.match(r'\([^)]+\)', w) and w is not None:
                            return w

                # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
                if len(syn) == 1:
                    w = syn[0]
                    if not re.match(r'\([^)]+\)', w) and w is not None:
                        return w

    return word  # as a last resort, return the input
"""


def cleanText(string, custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
    # apply custom preprocessing first
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    custom_stopwords = custom_stopwords if custom_stopwords is not None else []
    custom_words = custom_words if custom_words is not None else []
    custom_symbols = custom_symbols if custom_symbols is not None else []
    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
    stoplist = list(stop_words) + custom_stopwords

    # list of symbols we don't care about either
    symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", ";", ":", "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + custom_symbols
    # get rid of newlines
    string = string.strip().replace("\n", " ").replace("\r", " ")

    # replace twitter mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace email addresses
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)
    # replace HTML entities
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    # parse with spaCy
    spacy_doc = PARSER(string)
    tokens = []

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
    added_POS = ["NOUN", "NUM"]  # ,"VERB","ADJ"]  idea: include NUM in the corpus, but use only nouns for topic modeling http://aclweb.org/anthology/U15-1013
    # collect tokens in a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())
    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove symbols
    tokens = [tok for tok in tokens if tok not in symbols]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    # TODO expand abbreviations (esp. TU -> Technische Universität)

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(tokens)
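
# usage sketch for cleanText (hypothetical input string, not from the data):
#   cleanText("Der Drucker in Raum 42 druckt nicht. Siehe @admin", lemmatize=True)
# would keep roughly the noun/number tokens, lowercased, with the @-mention
# replaced by the MENTION placeholder defined above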


def generateTextfromXML(path2xml, clean=True, textfield='Beschreibung'):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for subject in root.iter(textfield):
        if clean:
            yield cleanText(subject.text)
        else:
            yield subject.text


def generateMetadatafromXML(path2xml, keys=["Loesung", "Kategorie", "Zusammenfassung"]):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root.findall('ticket'):
        # build a fresh dict per ticket, so consumers that keep the yielded
        # dicts do not all end up referencing the same mutated object
        metadata = dict.fromkeys(keys)
        for key in metadata:
            metadata[key] = ticket.find(key).text
        yield metadata


def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleanText(field.text, normalize_synonyms=normalize_Synonyms, lemmatize=False)
                else:
                    text = field.text
            else:
                # TODO clean here as well?
                metadata[field.tag] = field.text
        yield text, metadata
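
# expected XML shape, inferred from the three generators above (the root element
# name is an assumption; only the <ticket> children are actually used):
#   <tickets>
#     <ticket>
#       <Beschreibung>...</Beschreibung>
#       <Loesung>...</Loesung>
#       <Kategorie>...</Kategorie>
#       <Zusammenfassung>...</Zusammenfassung>
#     </ticket>
#   </tickets>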


def getFirstSynonym(word, thesaurus_gen):
    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return getHauptform(syn_block, word)
            else:  # if it is a phrase
                if word in syn:
                    return getHauptform(syn_block, word)
    return word  # as a last resort, return the original word
def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()
        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return the marker itself, i.e. skip anything in parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w) and re.match(r'\A[\w-]+\Z', w):
                return w

    return word  # as a last resort, return the original word
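
# sketch of the expected openthesaurus.csv format (an assumption based on the
# code above: one synonym set per row, ';'-separated, canonical forms marked
# with "(Hauptform)"):
#   Probelauf;Testlauf (Hauptform);Test
# with that row loaded, getFirstSynonym("Probelauf", THESAURUS_list) returns "testlauf"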

####################'####################'####################'####################'####################'##############

import de_core_news_md

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

LANGUAGE = 'de'

####################'####################'####################'####################'####################'##############

PARSER = de_core_news_md.load()  # spacy.load(LANGUAGE)
THESAURUS_list = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))  # !!!!!! the list() is important: without it later passes do not get the same synonyms, because the generator is consumed at runtime

## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")

#textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH), metadatas=generateMetadatafromXML(DATAPATH))
for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=True, clean=True):
    textacyCorpus.add_text(txt, dic)

for doc in textacyCorpus:
    print(doc.text)

#print(textacyCorpus[2].text)
#printRandomDoc(textacyCorpus)
#print(textacyCorpus[len(textacyCorpus)-1].text)

print()
print()

#################### second run: same pipeline without synonym normalization ####################

PARSER = de_core_news_md.load()  # spacy.load(LANGUAGE)

## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=False, clean=True):
    textacyCorpus.add_text(txt, dic)

for doc in textacyCorpus:
    print(doc.text)

print()
print()