topicModelingTickets/testra.py

# -*- coding: utf-8 -*-
import re
import time
import json

import spacy
import textacy

# Wall-clock timestamp taken near the top of the script; it is subtracted from
# `end` at the bottom of the file to report the total run time. Everything
# executed after this line (including the imports that follow) is counted.
start = time.time()

import enchant

from datetime import datetime

import xml.etree.ElementTree as ET

# Print the current local date and time as a marker for when this run began.
print(datetime.now())

"""
PARSER=spacy.load("de")


corpus = textacy.Corpus(PARSER)

testcontetn = [
    "fdsfdsfsd",
    "juzdtjlkö",
    "gfadojplk"
]

testmetda = [
    {"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
    {"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
    {"categoryName":"zhb","Solution":"","Subject":"setuji"}
]


def makecontent(testcontetn):
    for content in testcontetn:
        yield content


def makemeta( testmetda):
    for  metdata in  testmetda:
        yield  metdata


corpus.add_texts(
    makecontent(testcontetn),
    makemeta(testmetda)
)

print(corpus)
"""


from postal.parser import parse_address


# Feed two sample free-text address/signature strings to `parse_address`
# (imported from postal.parser above) and print each parse result in order.
# The loop variable is deliberately named `address` so the module-level name
# ends up bound to the last sample, as before.
for address in (
    "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480",
    "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund ",
):
    print(parse_address(address))


"""

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "testcorpus"


#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpus =  textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)


import pathlib

strings_path = pathlib.Path(corpus_path + 'strings.json')
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')

PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)


def save_corpus(corpus_path,corpus_name):

    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        PARSER.vocab.strings.dump(file)


    #save content
    contentpath = corpus_path + corpus_name+ "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus),contentpath)


    #save meta
    metapath = corpus_path + corpus_name +"_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


def load_corpus(corpus_path,corpus_name):
    # load new lang
    nlp = spacy.load("de")

    #load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path,"r") as file:
        nlp.vocab.strings.load(file)

    # define corpus
    corpus = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name +"_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    #load content
    contentpath = corpus_path + corpus_name+ "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))

    return corpus


save_corpus(corpus_path,corpus_name)

print(load_corpus(corpus_path,corpus_name))

"""

"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
    return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))

def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # durch den thesaurrus iterieren
    for syn_block in thesaurus:  # syn_block ist eine liste mit Synonymen

        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # falls syn einzelwort ist
                if word == syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
            else:  # falls es ein satz ist
                if word in syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
    return str(word)  # zur Not, das ursrpüngliche Wort zurückgeben

def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # nicht ausgeben, falls es in Klammern steht#todo gibts macnmal?? klammern aus
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w
    return word  # zur Not, das ursrpüngliche Wort zurückgeben
"""

"""
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"

tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()

for r in root:
    for element in r:

            if element.tag == "Synset":
                attrib = element.attrib
                for i,subentry in enumerate(element):
                    if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
                            string = (subentry.attrib["writtenForm"])
                            # replaceRockDots
                            string = re.sub(r'[ß]', "ss", string)
                            string = re.sub(r'[ö]', "oe", string)
                            string = re.sub(r'[ü]', "ue", string)
                            string = re.sub(r'[ä]', "ae", string)

                            # seperate_words_on_regex:
                            string = " ".join(re.compile(regex_specialChars).split(string))
                            string_list=string.split()
                            if len(string_list) == 1:
                                nomen.append(string.lower().strip())
"""

"""
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))

def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N

def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

"""

"""
### extract from derewo

#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html


raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")

for line in raw:
    line_list=line.split()
    if line_list[2] == "NN":
        string = line_list[1].lower()

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)


        nomen.append(string.lower().strip())


textacy.fileio.write_file_lines(nomen,"nomen2.txt")
"""

"""
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
content_collumn_name = "Description"
content_collumn = 9  # standardvalue

de_tickets=[]
en_tickets=[]
misc_tickets=[]

error_count = 0
for i, lst in enumerate(stream):
    if i == 0:
        de_tickets.append(lst)
        en_tickets.append(lst)
        misc_tickets.append(lst)
    else:
        try:
            content_collumn_ = lst[content_collumn]
            if detect(content_collumn_) == "de":
                de_tickets.append(lst)
            elif detect(content_collumn_) == "en":
                en_tickets.append(lst)
            else:
                misc_tickets.append(lst)

        except:
            misc_tickets.append(lst)
            error_count += 1

print(error_count)

textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")


"""

"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,"
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]


topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
    print(s.strip())

    #print(stringcleaning(w,string_comp))
    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    #result = specialFinder.sub(" ", w)
    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))

    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""

"""
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))


de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))


#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))

#print(blob.entities)

de_stop_words = list(map(replaceRockDots(),de_stop_words))
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))


#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")

"""
# Report the wall-clock seconds elapsed since `start` was taken near the top of
# the script (time.time() difference, printed as a float).
end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))
textcleaning macht immer noch keinen spass 2017-09-19 14:42:38 +02:00			`# -- coding: utf-8 --`
			`import re`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00			`import time`
eigene corpus-save/load methoden geschreiben 2017-10-09 12:50:34 +02:00			`import json`
lemmatizer gebaut 2017-09-20 15:22:13 +02:00
thesaurus auf basis von deWornNet weitergemacht 2017-09-28 12:42:05 +02:00			`import spacy`
textcleaning macht immer noch keinen spass 2017-09-19 14:42:38 +02:00			`import textacy`

bereit für weitern testrun 2017-09-25 13:12:23 +02:00			`start = time.time()`

thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`import enchant`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
			`from datetime import datetime`

			`import xml.etree.ElementTree as ET`

			`print(datetime.now())`

refactoring 2017-10-10 14:42:09 +02:00			`"""`
eigene corpus-save/load methoden geschreiben 2017-10-09 12:50:34 +02:00			`PARSER=spacy.load("de")`


			`corpus = textacy.Corpus(PARSER)`

			`testcontetn = [`
			`"fdsfdsfsd",`
			`"juzdtjlkö",`
			`"gfadojplk"`
			`]`

			`testmetda = [`
			`{"categoryName":"zhb","Solution":"","Subject":"schulungstest"},`
			`{"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},`
			`{"categoryName":"zhb","Solution":"","Subject":"setuji"}`
			`]`


			`def makecontent(testcontetn):`
			`for content in testcontetn:`
			`yield content`


			`def makemeta( testmetda):`
			`for metdata in testmetda:`
			`yield metdata`


			`corpus.add_texts(`
			`makecontent(testcontetn),`
			`makemeta(testmetda)`
			`)`

			`print(corpus)`
refactoring 2017-10-10 14:42:09 +02:00			`"""`


			`from postal.parser import parse_address`


			`address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"`
			`print(parse_address(address))`


			`address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "`
			`print(parse_address(address))`

eigene corpus-save/load methoden geschreiben 2017-10-09 12:50:34 +02:00

refactoring 2017-10-10 14:42:09 +02:00




			`"""`

eigene corpus-save/load methoden geschreiben 2017-10-09 12:50:34 +02:00			`corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"`
			`corpus_name = "testcorpus"`

refactoring 2017-10-10 14:42:09 +02:00
eigene corpus-save/load methoden geschreiben 2017-10-09 12:50:34 +02:00			`#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)`
			`#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)`



			`import pathlib`

			`strings_path = pathlib.Path(corpus_path + 'strings.json')`
			`path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')`

			`PARSER.vocab.dump(path_lexemes_bin_)`
			`nlp.vocab.load_lexemes(path_lexemes_bin_)`
refactoring 2017-10-10 14:42:09 +02:00
eigene corpus-save/load methoden geschreiben 2017-10-09 12:50:34 +02:00
			`def save_corpus(corpus_path,corpus_name):`

			`# save stringstore`
			`stringstore_path = corpus_path + corpus_name + '_strings.json'`
			`with open(stringstore_path, "w") as file:`
			`PARSER.vocab.strings.dump(file)`


			`#save content`
			`contentpath = corpus_path + corpus_name+ "_content.bin"`
			`textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus),contentpath)`


			`#save meta`
			`metapath = corpus_path + corpus_name +"_meta.json"`
			`textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)`



			`def load_corpus(corpus_path,corpus_name):`
			`# load new lang`
			`nlp = spacy.load("de")`

			`#load stringstore`
			`stringstore_path = corpus_path + corpus_name + '_strings.json'`
			`with open(stringstore_path,"r") as file:`
			`nlp.vocab.strings.load(file)`

			`# define corpus`
			`corpus = textacy.Corpus(nlp)`

			`# load meta`
			`metapath = corpus_path + corpus_name +"_meta.json"`
			`metadata_stream = textacy.fileio.read_json_lines(metapath)`

			`#load content`
			`contentpath = corpus_path + corpus_name+ "_content.bin"`
			`spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)`

			`for spacy_doc, metadata in zip(spacy_docs, metadata_stream):`
			`corpus.add_doc(`
			`textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))`

			`return corpus`


			`save_corpus(corpus_path,corpus_name)`

			`print(load_corpus(corpus_path,corpus_name))`

refactoring 2017-10-10 14:42:09 +02:00			`"""`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus auf basis von deWornNet weitergemacht 2017-09-28 12:42:05 +02:00			`"""`
			`def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):`
			`#return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))`
			`return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))`

			`def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):`
			`if not isinstance(word, str):`
			`return str(word)`

			`word = word.lower()`

			`# durch den thesaurrus iterieren`
			`for syn_block in thesaurus: # syn_block ist eine liste mit Synonymen`

			`for syn in syn_block:`
			`syn = syn.lower()`
			`if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist`
			`if word == syn:`
			`return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))`
			`else: # falls es ein satz ist`
			`if word in syn:`
			`return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))`
			`return str(word) # zur Not, das ursrpüngliche Wort zurückgeben`

			`def getHauptform(syn_block, word, default_return_first_Syn=False):`
			`for syn in syn_block:`
			`syn = syn.lower()`

			`if "hauptform" in syn and len(syn.split(" ")) <= 2:`
			`# nicht ausgeben, falls es in Klammern steht#todo gibts macnmal?? klammern aus`
			`for w in syn.split(" "):`
			`if not re.match(r'\([^)]+\)', w):`
			`return w`

			`if default_return_first_Syn:`
			`# falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht`
			`for w in syn_block:`
			`if not re.match(r'\([^)]+\)', w):`
			`return w`
			`return word # zur Not, das ursrpüngliche Wort zurückgeben`
			`"""`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus auf basis von deWornNet weitergemacht 2017-09-28 12:42:05 +02:00			`"""`
			`path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"`

			`tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))`
			`root = tree.getroot()`
thesaurus auf basis von deWornNet angefangen 2017-09-26 11:03:09 +02:00
			`for r in root:`
			`for element in r:`

thesaurus auf basis von deWornNet weitergemacht 2017-09-28 12:42:05 +02:00			`if element.tag == "Synset":`
			`attrib = element.attrib`
thesaurus auf basis von deWornNet angefangen 2017-09-26 11:03:09 +02:00			`for i,subentry in enumerate(element):`
			`if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":`
			`string = (subentry.attrib["writtenForm"])`
			`# replaceRockDots`
			`string = re.sub(r'[ß]', "ss", string)`
			`string = re.sub(r'[ö]', "oe", string)`
			`string = re.sub(r'[ü]', "ue", string)`
			`string = re.sub(r'[ä]', "ae", string)`

			`# seperate_words_on_regex:`
			`string = " ".join(re.compile(regex_specialChars).split(string))`
			`string_list=string.split()`
			`if len(string_list) == 1:`
			`nomen.append(string.lower().strip())`
thesaurus auf basis von deWornNet weitergemacht 2017-09-28 12:42:05 +02:00			`"""`

eigene corpus-save/load methoden geschreiben 2017-10-09 12:50:34 +02:00			`"""`
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`import re`
			`from collections import Counter`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`def words(text): return re.findall(r'\w+', text.lower())`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`def P(word, N=sum(WORDS.values())):`
			"Probability of `word`."
			`return WORDS[word] / N`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`def correction(word):`
			`"Most probable spelling correction for word."`
			`return max(candidates(word), key=P)`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`def candidates(word):`
			`"Generate possible spelling corrections for word."`
			`return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`def known(words):`
			"The subset of `words` that appear in the dictionary of WORDS."
			`return set(w for w in words if w in WORDS)`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`def edits1(word):`
			"All edits that are one edit away from `word`."
			`letters = 'abcdefghijklmnopqrstuvwxyz'`
			`splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]`
			`deletes = [L + R[1:] for L, R in splits if R]`
			`transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]`
			`replaces = [L + c + R[1:] for L, R in splits if R for c in letters]`
			`inserts = [L + c + R for L, R in splits for c in letters]`
			`return set(deletes + transposes + replaces + inserts)`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00			`def edits2(word):`
			"All edits that are two edits away from `word`."
			`return (e2 for e1 in edits1(word) for e2 in edits1(e1))`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
eigene corpus-save/load methoden geschreiben 2017-10-09 12:50:34 +02:00			`"""`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
			`"""`
thesaurus auf basis von deWornNet angefangen 2017-09-26 11:03:09 +02:00			`### extract from derewo`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus auf basis von deWornNet angefangen 2017-09-26 11:03:09 +02:00			`#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00

thesaurus auf basis von deWornNet angefangen 2017-09-26 11:03:09 +02:00			`raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus auf basis von deWornNet angefangen 2017-09-26 11:03:09 +02:00			`for line in raw:`
			`line_list=line.split()`
			`if line_list[2] == "NN":`
			`string = line_list[1].lower()`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00
thesaurus auf basis von deWornNet angefangen 2017-09-26 11:03:09 +02:00			`# replaceRockDots`
			`string = re.sub(r'[ß]', "ss", string)`
			`string = re.sub(r'[ö]', "oe", string)`
			`string = re.sub(r'[ü]', "ue", string)`
			`string = re.sub(r'[ä]', "ae", string)`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00

thesaurus auf basis von deWornNet angefangen 2017-09-26 11:03:09 +02:00			`nomen.append(string.lower().strip())`


			`textacy.fileio.write_file_lines(nomen,"nomen2.txt")`
bereit für weitern testrun 2017-09-25 13:12:23 +02:00			`"""`

			`"""`
bereit für weitern testrun 2017-09-21 12:05:32 +02:00			`stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")`
			`content_collumn_name = "Description"`
			`content_collumn = 9 # standardvalue`

			`de_tickets=[]`
			`en_tickets=[]`
			`misc_tickets=[]`

			`error_count = 0`
			`for i, lst in enumerate(stream):`
			`if i == 0:`
			`de_tickets.append(lst)`
			`en_tickets.append(lst)`
			`misc_tickets.append(lst)`
			`else:`
			`try:`
			`content_collumn_ = lst[content_collumn]`
			`if detect(content_collumn_) == "de":`
			`de_tickets.append(lst)`
			`elif detect(content_collumn_) == "en":`
			`en_tickets.append(lst)`
			`else:`
			`misc_tickets.append(lst)`

			`except:`
			`misc_tickets.append(lst)`
			`error_count += 1`

			`print(error_count)`

			`textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")`
			`textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")`
			`textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")`
textcleaning macht immer noch keinen spass 2017-09-19 14:42:38 +02:00

bereit für weitern testrun 2017-09-25 13:12:23 +02:00			`"""`
textcleaning macht immer noch keinen spass 2017-09-19 14:42:38 +02:00
			`"""`
			regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"\|</>?]'


			`def stringcleaning(stringstream, funclist):`
			`for string in stringstream:`
			`for f in funclist:`
			`string = f(string)`
			`yield string`


			`def seperate_words_on_regex(regex=regex_specialChars):`
			`return lambda string: " ".join(re.compile(regex).split(string))`


			`words = [`
			`"uniaccount",`
			`"nr54065467",`
			`"nr54065467",`
			`"455a33c5,"`
			`"tvt?=",`
			`"tanja.saborowski@tu-dortmund.de",`
			`"-",`
			`"m-sw1-vl4053.itmc.tu-dortmund.de",`
			`"------problem--------"`
			`]`



			`topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)`
			specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"\|<,./>?]', re.IGNORECASE)

			`for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):`
			`print(s.strip())`

			`#print(stringcleaning(w,string_comp))`
			`#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))`
			#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"\|<,./>?]',w)))
			`#result = specialFinder.sub(" ", w)`
			#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"\|<,./>?]'," ",w))

			`#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))`
lemmatizer gebaut 2017-09-20 15:22:13 +02:00			`"""`

			`"""`
bereit für weitern testrun 2017-09-21 12:05:32 +02:00			`def replaceRockDots():`
			`return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))`
lemmatizer gebaut 2017-09-20 15:22:13 +02:00


bereit für weitern testrun 2017-09-21 12:05:32 +02:00			`de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))`
lemmatizer gebaut 2017-09-20 15:22:13 +02:00

			`#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))`

			`#print(blob.entities)`

			`de_stop_words = list(map(replaceRockDots(),de_stop_words))`
bereit für weitern testrun 2017-09-21 12:05:32 +02:00			`#LEMMAS = list(map(replaceRockDots(),LEMMAS))`
			`#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))`
lemmatizer gebaut 2017-09-20 15:22:13 +02:00
			`de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))`
bereit für weitern testrun 2017-09-21 12:05:32 +02:00			`#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))`
			`#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))`
lemmatizer gebaut 2017-09-20 15:22:13 +02:00



bereit für weitern testrun 2017-09-21 12:05:32 +02:00			`#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")`
			`#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")`
			`textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")`
lemmatizer gebaut 2017-09-20 15:22:13 +02:00
bereit für weitern testrun 2017-09-21 12:05:32 +02:00			`"""`
lemmatizer gebaut 2017-09-20 15:22:13 +02:00			`end = time.time()`
			`print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))`

thesaurus fertiggestellt 2017-10-02 14:31:33 +02:00