Wrote own corpus save/load methods
parent 3bfbebc894
commit b542c4285a
Binary file not shown.
testo.py (22 changed lines)

@@ -7,8 +7,8 @@ print(datetime.now())
 #path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
-path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
-#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
+#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
+path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

 path_csv_split = path2csv.split("/")
 print(path_csv_split[len(path_csv_split)-1])
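The filename printout above splits on "/" by hand; an equivalent, platform-independent way to get the basename, sketched purely for illustration (not part of the commit):

import os

path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
print(os.path.basename(path2csv))  # de_tickets.csv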
@@ -501,7 +501,7 @@ def build_thesaurus(path2lexicalentries, path2synsets):

 THESAURUS=[]
-#THESAURUS=build_thesaurus(path2lexicalentries=lexicalentries,path2synsets=synsets) #todo: switch back on
+THESAURUS=build_thesaurus(path2lexicalentries=lexicalentries,path2synsets=synsets)

 def getFirstSynonym(word, thesaurus=THESAURUS):
     if not isinstance(word, str):
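getFirstSynonym normalizes a word to a canonical synonym. A minimal sketch of that lookup, assuming THESAURUS is a list of synonym groups (the actual structure produced by build_thesaurus is not visible in this hunk, so the format and the helper name are illustrative):

def get_first_synonym(word, thesaurus):
    # thesaurus: list of synonym groups, each a list of strings (assumed format)
    for synset in thesaurus:
        if word in synset:
            return synset[0]          # normalize to the group's first entry
    return word                       # unknown words pass through unchanged

THESAURUS = [["telefon", "fernsprecher", "apparat"], ["rechner", "computer", "pc"]]
print(get_first_synonym("computer", THESAURUS))  # rechner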
@@ -541,7 +541,7 @@ def words(text): return re.findall(r'\w+', text.lower())

 WORDS={}
-#WORDS = Counter(words(open(path2words).read())) #todo: switch back on
+WORDS = Counter(words(open(path2words).read()))

 def P(word, N=sum(WORDS.values())):
     "Probability of `word`."
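WORDS and P() form the frequency model of a Norvig-style spelling corrector: P(word) is simply the word's relative frequency in the reference word list. A tiny self-contained sketch (the script builds WORDS from the file at path2words; the inline sample text here is only for illustration):

from collections import Counter
import re

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words("das ticket wurde geprueft das ticket ist geschlossen"))

def P(word, N=sum(WORDS.values())):
    "Relative frequency of `word` in the reference text."
    return WORDS[word] / N

print(P("ticket"))   # 0.25 -- 2 of 8 tokens
print(P("drucker"))  # 0.0  -- unseen words get probability zero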
@@ -629,10 +629,10 @@ def stringcleaning(stringstream):
         string = " ".join([lemmatizeWord(word) for word in string.split()])

         # normalize synonyms  #idea: before or after lemmatizing?
-        #string = " ".join([getFirstSynonym(word) for word in string.split()])
+        string = " ".join([getFirstSynonym(word) for word in string.split()])

         # autocorrect
-        #string = " ".join([autocorrectWord(word) for word in string.split()])
+        string = " ".join([autocorrectWord(word) for word in string.split()])

         yield string
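With both lines switched on, every string now runs through three word-level passes in sequence: lemmatization, synonym normalization, autocorrection. A minimal sketch of that chaining with trivial placeholder helpers (the real lemmatizeWord, getFirstSynonym and autocorrectWord live elsewhere in testo.py and are not shown in this diff):

def stringcleaning(stringstream,
                   lemmatize=str.lower,        # placeholder for lemmatizeWord
                   synonym=lambda w: w,        # placeholder for getFirstSynonym
                   autocorrect=lambda w: w):   # placeholder for autocorrectWord
    for string in stringstream:
        string = " ".join(lemmatize(word) for word in string.split())
        string = " ".join(synonym(word) for word in string.split())
        string = " ".join(autocorrect(word) for word in string.split())
        yield string

print(list(stringcleaning(["Drucker DEFEKT", "Telefon tot"])))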
@@ -745,7 +745,7 @@ custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","vora
     "auffahren","vorgang","hinweis","institut","universitaet","name","gruss","id","erfolg","mail","folge",
     "nummer","team","fakultaet","email","absender","tu","versenden","vorname","message",
     "service","strasse","prozess","portal","raum","personal","moeglichkeit","fremd","wende","rueckfrage", "stehen", "verfuegung"
-    "funktionieren","kollege", "pruefen"
+    "funktionieren","kollege", "pruefen","hoffen"
     ]
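One thing to watch in this list: there is no comma after "verfuegung", so Python's implicit string-literal concatenation silently merges it with the next entry. A quick demonstration of the effect:

custom_words = ["verfuegung"
                "funktionieren", "kollege"]
print(custom_words)  # ['verfuegungfunktionieren', 'kollege'] -- probably not intended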
@@ -896,9 +896,15 @@ weighting = 'tf'
 # weighting ='tfidf'
 named_entities = False

 #printvecotorization(ngrams=ngrams,min_df=min_df,max_df=max_df,weighting=weighting,named_entities=named_entities)
+printvecotorization(ngrams=1,min_df=1,max_df=1.0,weighting=weighting)
+printvecotorization(ngrams=1,min_df=1,max_df=0.5,weighting=weighting)
+printvecotorization(ngrams=1,min_df=1,max_df=0.8,weighting=weighting)


+printvecotorization(ngrams=(1,2),min_df=1,max_df=1.0,weighting=weighting)
+printvecotorization(ngrams=(1,2),min_df=1,max_df=0.5,weighting=weighting)
+printvecotorization(ngrams=(1,2),min_df=1,max_df=0.8,weighting=weighting)


 """
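The six calls added above sweep unigrams vs. (1,2)-grams against max_df values of 1.0, 0.5 and 0.8 with tf weighting. The body of printvecotorization is not part of this diff; as a rough stand-in, the same kind of min_df/max_df sweep sketched with scikit-learn's CountVectorizer (document set and use of scikit-learn are purely illustrative):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["telefon anschluss defekt",
        "drucker im pool defekt",
        "neuer telefon anschluss beantragt"]

for ngrams in [(1, 1), (1, 2)]:
    for max_df in [1.0, 0.5, 0.8]:
        vec = CountVectorizer(ngram_range=ngrams, min_df=1, max_df=max_df)
        dtm = vec.fit_transform(docs)
        # vocabulary shrinks as max_df prunes terms that occur in too many documents
        print(ngrams, max_df, len(vec.vocabulary_))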
testra.py (116 changed lines)

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import re
 import time
+import json

 import spacy
 import textacy
@@ -16,7 +17,116 @@ import xml.etree.ElementTree as ET
 print(datetime.now())


-#PARSER=spacy.load("de")
+PARSER=spacy.load("de")
+
+
+corpus = textacy.Corpus(PARSER)
+
+testcontetn = [
+    "fdsfdsfsd",
+    "juzdtjlkö",
+    "gfadojplk"
+]
+
+testmetda = [
+    {"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
+    {"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
+    {"categoryName":"zhb","Solution":"","Subject":"setuji"}
+]
+
+
+def makecontent(testcontetn):
+    for content in testcontetn:
+        yield content
+
+
+def makemeta( testmetda):
+    for metdata in testmetda:
+        yield metdata
+
+
+corpus.add_texts(
+    makecontent(testcontetn),
+    makemeta(testmetda)
+)
+
+print(corpus)
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "testcorpus"
+
+"""
+#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
+#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
+
+
+import pathlib
+
+strings_path = pathlib.Path(corpus_path + 'strings.json')
+path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
+
+PARSER.vocab.dump(path_lexemes_bin_)
+nlp.vocab.load_lexemes(path_lexemes_bin_)
+"""
+
+def save_corpus(corpus_path,corpus_name):
+
+    # save stringstore
+    stringstore_path = corpus_path + corpus_name + '_strings.json'
+    with open(stringstore_path, "w") as file:
+        PARSER.vocab.strings.dump(file)
+
+    #save content
+    contentpath = corpus_path + corpus_name+ "_content.bin"
+    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus),contentpath)
+
+    #save meta
+    metapath = corpus_path + corpus_name +"_meta.json"
+    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
+
+def load_corpus(corpus_path,corpus_name):
+    # load new lang
+    nlp = spacy.load("de")
+
+    #load stringstore
+    stringstore_path = corpus_path + corpus_name + '_strings.json'
+    with open(stringstore_path,"r") as file:
+        nlp.vocab.strings.load(file)
+
+    # define corpus
+    corpus = textacy.Corpus(nlp)
+
+    # load meta
+    metapath = corpus_path + corpus_name +"_meta.json"
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+
+    #load content
+    contentpath = corpus_path + corpus_name+ "_content.bin"
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+
+    return corpus
+
+
+save_corpus(corpus_path,corpus_name)
+
+print(load_corpus(corpus_path,corpus_name))
+
+
+#idea: do the same for the spellchecker, lemmatizer and thesaurus because of memory
+# todo: reload the generators every time? because of sequential picking at runtime, otherwise not deterministic
+
+
 """
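The new save_corpus/load_corpus pair writes three sidecar files per corpus name (string store, serialized spaCy docs, JSON-lines metadata) and re-pairs documents with their metadata via zip, so both streams must preserve the exact write order. A minimal sketch of that ordering contract using plain JSON lines, independent of the spaCy/textacy helpers above (filenames and helper names are illustrative):

import json

def save_pairs(path, texts, metas):
    # one record per line; text and metadata are written together, in order
    with open(path, "w", encoding="utf-8") as f:
        for text, meta in zip(texts, metas):
            f.write(json.dumps({"text": text, "meta": meta}, ensure_ascii=False) + "\n")

def load_pairs(path):
    # yields (text, meta) pairs in exactly the order they were saved
    with open(path, encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            yield record["text"], record["meta"]

save_pairs("testcorpus_pairs.json",
           ["fdsfdsfsd", "gfadojplk"],
           [{"categoryName": "zhb"}, {"categoryName": "neuanschluss"}])
print(list(load_pairs("testcorpus_pairs.json")))

Keeping text and metadata in one record per line avoids relying on two separately written files staying in sync.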
@@ -97,7 +207,7 @@ for r in root:




 """
 import re
 from collections import Counter

@@ -135,7 +245,7 @@ def edits2(word):
     "All edits that are two edits away from `word`."
     return (e2 for e1 in edits1(word) for e2 in edits1(e1))

 """

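edits2 is just edits1 composed with itself, so the candidate set grows quickly. The file's own edits1 is not shown in this diff; the sketch below uses the standard Norvig version, assumed equivalent, to show the blow-up:

def edits1(word):
    "All strings one edit (delete, transpose, replace, insert) away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

print(len(edits1("ticket")))        # a few hundred candidates
print(len(set(edits2("ticket"))))   # tens of thousands of candidates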