topicModelingTickets/testra.py

# -*- coding: utf-8 -*-
import corenlp as corenlp
import os
import re
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser

# Path of the lemmatization table that the script reads further down.
filepath = "lemmatization-de.txt"


# Read the sample text, build a German TextBlob whose pattern parser is
# configured with pprint=True and lemmata=True, then print the parse result.
sample_text = str(textacy.fileio.read_file("teststring.txt"))
german_parser = PatternParser(pprint=True, lemmata=True)
blob = TextBlobDE(sample_text, parser=german_parser)

print(blob.parse())


# First column becomes {lemma: id}; second column becomes {word: id}, where a
# word shares the id of the lemma it belongs to.
# Data source: http://www.lexiconista.com/datasets/lemmatization/

lemma2id = {}
word2id = {}

# Iterate the file lazily; the line index doubles as the id handed to each
# lemma the first time it is seen.
for line_no, line in enumerate(textacy.fileio.read_file_lines(filepath=filepath)):
    columns = line.split()
    if len(columns) < 2:
        # Blank or one-column lines carry no (lemma, word) pair; skip them
        # instead of failing with an IndexError.
        continue

    lemma = columns[0].lower()
    word = columns[1].lower()

    # Keep the id assigned at the lemma's first occurrence.
    lemma_id = lemma2id.setdefault(lemma, line_no)

    # Look the id up by the lemma, not by the word: an inflected form is
    # generally not a key of lemma2id, so indexing with `word` raised KeyError.
    word2id[word] = lemma_id


# Disabled scratch code for regex-based string cleaning. It is parked inside a
# bare string literal, so none of it executes when the script runs.
# NOTE(review): in the `words` list below, "455a33c5," has no comma after its
# closing quote, so it would be concatenated with "tvt?=" into one entry if
# this code were re-enabled -- confirm whether that is intended.
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,"
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]


topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
    print(s.strip())

    #print(stringcleaning(w,string_comp))
    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    #result = specialFinder.sub(" ", w)
    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))

    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""
textcleaning macht immer noch keinen spass 2017-09-19 14:42:38 +02:00			`# -- coding: utf-8 --`
			`import corenlp as corenlp`
			`import os`
			`import re`
			`import textacy`
			`import nltk`
			`from textblob_de import TextBlobDE`
			`from textblob_de import PatternParser`

			`filepath = "lemmatization-de.txt"`




			`blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")),parser=PatternParser(pprint=True, lemmata=True))`

			`print(blob.parse())`





			`#erste spalte zu {lemma : id} . zweite spalte zu {word : id}`



			`"""http://www.lexiconista.com/datasets/lemmatization/"""`

			`lemma2id = {}`
			`word2id = {}`

			`for id,line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):`

			`lemma = line.split()[0].strip().lower()`
			`if lemma not in lemma2id:`
			`lemma2id[lemma] = id`

			`word = line.split()[1].strip().lower()`

			`word2id[word] = lemma2id[word]`










			`"""`
			regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"\|</>?]'


			`def stringcleaning(stringstream, funclist):`
			`for string in stringstream:`
			`for f in funclist:`
			`string = f(string)`
			`yield string`


			`def seperate_words_on_regex(regex=regex_specialChars):`
			`return lambda string: " ".join(re.compile(regex).split(string))`


			`words = [`
			`"uniaccount",`
			`"nr54065467",`
			`"nr54065467",`
			`"455a33c5,"`
			`"tvt?=",`
			`"tanja.saborowski@tu-dortmund.de",`
			`"-",`
			`"m-sw1-vl4053.itmc.tu-dortmund.de",`
			`"------problem--------"`
			`]`



			`topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)`
			specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"\|<,./>?]', re.IGNORECASE)

			`for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):`
			`print(s.strip())`

			`#print(stringcleaning(w,string_comp))`
			`#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))`
			#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"\|<,./>?]',w)))
			`#result = specialFinder.sub(" ", w)`
			#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"\|<,./>?]'," ",w))

			`#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))`
			`"""`