topicModelingTickets/testra.py

93 lines
1.9 KiB
Python

# -*- coding: utf-8 -*-
import corenlp as corenlp
import os
import re
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser
# Path of the lemma / word-form table read by the lookup-table loop below.
filepath = "lemmatization-de.txt"

# Read the sample text, parse it with the pattern parser (pretty-printing and
# lemmata switched on) and print the parse result.
sample_text = str(textacy.fileio.read_file("teststring.txt"))
german_parser = PatternParser(pprint=True, lemmata=True)
blob = TextBlobDE(sample_text, parser=german_parser)
parsed = blob.parse()
print(parsed)
# Build two lookup tables from the lemmatization list:
#   first column  -> {lemma: id}
#   second column -> {word: id of that word's lemma}
# Data source: http://www.lexiconista.com/datasets/lemmatization/
lemma2id = {}
word2id = {}
for line_no, line in enumerate(textacy.fileio.read_file_lines(filepath=filepath)):
    columns = line.split()
    if len(columns) < 2:
        # Blank or incomplete line: there is no lemma/word pair to record.
        continue
    lemma = columns[0].lower()
    word = columns[1].lower()
    # A lemma's id is the index of the first line on which it appears.
    lemma_id = lemma2id.setdefault(lemma, line_no)
    # Key the lookup by lemma, not by word: an inflected form is usually not
    # a key of lemma2id, so indexing by the word would raise KeyError.
    word2id[word] = lemma_id
# Disabled experiment, kept as a bare string literal so that none of it runs:
# it splits a handful of sample strings on special characters and prints the
# stripped pieces.
# The literal is raw because the regex text inside contains backslash
# sequences such as \- and \[ that are not valid escapes in a normal string
# and would otherwise trigger an invalid-escape-sequence warning.
# NOTE(review): in the `words` list, "455a33c5," carries its comma inside the
# quotes, so it would be concatenated with the following "tvt?=" literal --
# confirm whether two separate entries were intended.
r"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
def stringcleaning(stringstream, funclist):
for string in stringstream:
for f in funclist:
string = f(string)
yield string
def seperate_words_on_regex(regex=regex_specialChars):
return lambda string: " ".join(re.compile(regex).split(string))
words = [
"uniaccount",
"nr54065467",
"nr54065467",
"455a33c5,"
"tvt?=",
"tanja.saborowski@tu-dortmund.de",
"-",
"m-sw1-vl4053.itmc.tu-dortmund.de",
"------problem--------"
]
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
print(s.strip())
#print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""