Wrote custom corpus save/load methods
parent 3bfbebc894
commit b542c4285a
Binary file not shown.
testo.py (22 lines changed)

@@ -7,8 +7,8 @@ print(datetime.now())


 #path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
-path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
-#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
+#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
+path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

 path_csv_split = path2csv.split("/")
 print(path_csv_split[len(path_csv_split)-1])

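Aside: the print above only shows the file name of the selected CSV. A minimal standard-library equivalent, reusing the script's path2csv — a sketch, not how the script does it:

import os

print(os.path.basename(path2csv))    # same value as path_csv_split[len(path_csv_split)-1]
print(path2csv.split("/")[-1])       # or without os.path
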
@@ -501,7 +501,7 @@ def build_thesaurus(path2lexicalentries, path2synsets):


 THESAURUS=[]
-#THESAURUS=build_thesaurus(path2lexicalentries=lexicalentries,path2synsets=synsets) #todo anschalten
+THESAURUS=build_thesaurus(path2lexicalentries=lexicalentries,path2synsets=synsets)

 def getFirstSynonym(word, thesaurus=THESAURUS):
     if not isinstance(word, str):

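With the placeholder list replaced, THESAURUS is now actually built from the lexical-entry and synset files, and getFirstSynonym (only partially visible in this hunk) maps a word to the first member of its synonym group. The repository's real implementation is not shown here; a minimal sketch of such a lookup, assuming THESAURUS is a list of synonym groups (lists of lower-cased strings) — that structure is an assumption:

def getFirstSynonym(word, thesaurus=THESAURUS):
    # sketch only; assumes each entry of `thesaurus` is a list of synonymous strings
    if not isinstance(word, str):
        return word
    word = word.lower()
    for synset in thesaurus:
        if word in synset:
            return synset[0]   # the first listed synonym stands in for the whole group
    return word                # no group found: keep the word unchanged
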
@@ -541,7 +541,7 @@ def words(text): return re.findall(r'\w+', text.lower())


 WORDS={}
-#WORDS = Counter(words(open(path2words).read())) #todo anschalten
+WORDS = Counter(words(open(path2words).read()))

 def P(word, N=sum(WORDS.values())):
     "Probability of `word`."

@@ -629,10 +629,10 @@ def stringcleaning(stringstream):
         string = " ".join([lemmatizeWord(word) for word in string.split()])

         # synonyme normalisieren #idee vor oder nach lemmatize?
-        #string = " ".join([getFirstSynonym(word) for word in string.split()])
+        string = " ".join([getFirstSynonym(word) for word in string.split()])

         # autocorrect
-        #string = " ".join([autocorrectWord(word) for word in string.split()])
+        string = " ".join([autocorrectWord(word) for word in string.split()])

         yield string

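With both lines uncommented, every ticket string now passes through lemmatisation, synonym normalisation (the German comment asks whether this should happen before or after lemmatising) and autocorrection before being yielded. A condensed sketch of the now-active pipeline; the enclosing for-loop and everything outside these three steps is assumed, and lemmatizeWord, getFirstSynonym and autocorrectWord are the script's own helpers:

def stringcleaning(stringstream):
    for string in stringstream:
        # lemmatize every token
        string = " ".join([lemmatizeWord(word) for word in string.split()])
        # normalize synonyms
        string = " ".join([getFirstSynonym(word) for word in string.split()])
        # autocorrect spelling
        string = " ".join([autocorrectWord(word) for word in string.split()])
        yield string
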
@@ -745,7 +745,7 @@ custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","vora
 "auffahren","vorgang","hinweis","institut","universitaet","name","gruss","id","erfolg","mail","folge",
 "nummer","team","fakultaet","email","absender","tu","versenden","vorname","message",
 "service","strasse","prozess","portal","raum","personal","moeglichkeit","fremd","wende","rueckfrage", "stehen", "verfuegung"
-"funktionieren","kollege", "pruefen"
+"funktionieren","kollege", "pruefen","hoffen"
 ]


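custom_words collects German domain stopwords (greetings, sign-offs, ticket boilerplate); this commit only appends "hoffen". Note that "verfuegung" and "funktionieren" sit on adjacent lines with no comma between them, so Python's implicit string concatenation turns them into the single entry "verfuegungfunktionieren", which is probably unintended. How such a list is typically folded into a stopword filter, as a hypothetical sketch (the generic stopword set and the filtering step are assumptions, not the script's code):

# hypothetical: merge the custom list into a generic German stopword set, then filter
de_stop_words = {"und", "oder", "aber"}                    # placeholder for a real stopword list
stop_words = de_stop_words.union(custom_words)

tokens = ["hilfe", "drucker", "und", "hoffen", "defekt"]
print([tok for tok in tokens if tok not in stop_words])    # -> ['drucker', 'defekt']
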
@@ -896,9 +896,15 @@ weighting = 'tf'
 # weighting ='tfidf'
 named_entities = False

-#printvecotorization(ngrams=ngrams,min_df=min_df,max_df=max_df,weighting=weighting,named_entities=named_entities)
+printvecotorization(ngrams=1,min_df=1,max_df=1.0,weighting=weighting)
+printvecotorization(ngrams=1,min_df=1,max_df=0.5,weighting=weighting)
+printvecotorization(ngrams=1,min_df=1,max_df=0.8,weighting=weighting)


+printvecotorization(ngrams=(1,2),min_df=1,max_df=1.0,weighting=weighting)
+printvecotorization(ngrams=(1,2),min_df=1,max_df=0.5,weighting=weighting)
+printvecotorization(ngrams=(1,2),min_df=1,max_df=0.8,weighting=weighting)



 """

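The single commented-out call is replaced by six explicit calls that sweep max_df over 1.0, 0.5 and 0.8, once for unigrams and once for (1,2)-grams, keeping min_df=1 and the chosen weighting. The same six calls written as a parameter sweep, purely as a sketch around the script's own printvecotorization:

for ngrams in (1, (1, 2)):
    for max_df in (1.0, 0.5, 0.8):
        printvecotorization(ngrams=ngrams, min_df=1, max_df=max_df, weighting=weighting)
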
testra.py (116 lines changed)

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import re
 import time
+import json

 import spacy
 import textacy

@@ -16,7 +17,116 @@ import xml.etree.ElementTree as ET
 print(datetime.now())


-#PARSER=spacy.load("de")
+PARSER=spacy.load("de")


+corpus = textacy.Corpus(PARSER)
+
+testcontetn = [
+    "fdsfdsfsd",
+    "juzdtjlkö",
+    "gfadojplk"
+]
+
+testmetda = [
+    {"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
+    {"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
+    {"categoryName":"zhb","Solution":"","Subject":"setuji"}
+]
+
+
+def makecontent(testcontetn):
+    for content in testcontetn:
+        yield content
+
+
+def makemeta( testmetda):
+    for metdata in testmetda:
+        yield metdata
+
+
+corpus.add_texts(
+    makecontent(testcontetn),
+    makemeta(testmetda)
+)
+
+print(corpus)
+
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "testcorpus"
+
+"""
+#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
+#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
+
+
+import pathlib
+
+strings_path = pathlib.Path(corpus_path + 'strings.json')
+path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
+
+PARSER.vocab.dump(path_lexemes_bin_)
+nlp.vocab.load_lexemes(path_lexemes_bin_)
+"""
+
+def save_corpus(corpus_path,corpus_name):
+
+    # save stringstore
+    stringstore_path = corpus_path + corpus_name + '_strings.json'
+    with open(stringstore_path, "w") as file:
+        PARSER.vocab.strings.dump(file)
+
+    #save content
+    contentpath = corpus_path + corpus_name+ "_content.bin"
+    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus),contentpath)
+
+    #save meta
+    metapath = corpus_path + corpus_name +"_meta.json"
+    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
+
+
+def load_corpus(corpus_path,corpus_name):
+    # load new lang
+    nlp = spacy.load("de")
+
+    #load stringstore
+    stringstore_path = corpus_path + corpus_name + '_strings.json'
+    with open(stringstore_path,"r") as file:
+        nlp.vocab.strings.load(file)
+
+    # define corpus
+    corpus = textacy.Corpus(nlp)
+
+    # load meta
+    metapath = corpus_path + corpus_name +"_meta.json"
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+
+    #load content
+    contentpath = corpus_path + corpus_name+ "_content.bin"
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+
+    return corpus
+
+
+save_corpus(corpus_path,corpus_name)
+
+print(load_corpus(corpus_path,corpus_name))
+
+
+#idee das auch mit spellchecker, lemmetaizer und thesaurus machen wegen memory
+# todo generators immer neu laden? wegen laufzeit-nacheinander-picking, denn sonst nicht det
+
+
 """

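The new save_corpus/load_corpus pair replaces textacy's built-in Corpus.save/Corpus.load, which is left commented out in the experiment block above. Per corpus it persists three artifacts: the spaCy StringStore as <name>_strings.json, the parsed documents as <name>_content.bin via textacy.fileio.write_spacy_docs, and one JSON line of metadata per document. load_corpus builds a fresh "de" pipeline, restores the StringStore first (presumably so the binary docs can resolve their token strings), then zips the deserialized docs back together with their metadata. The trailing German notes say: idea — do the same for the spellchecker, lemmatizer and thesaurus to save memory; and todo — do the generators have to be re-created on each run, since picking from them sequentially is otherwise not deterministic? A round-trip usage sketch, reusing only the commit's own helpers and the old textacy/spaCy 1.x API used throughout the file; the metadata keys come from the test data above:

# persist the in-memory test corpus, then rebuild it from disk
save_corpus(corpus_path, corpus_name)              # writes testcorpus_strings.json / _content.bin / _meta.json

restored = load_corpus(corpus_path, corpus_name)
print(restored)                                    # should report the same number of docs as `corpus`

for doc in restored:
    print(doc.metadata["categoryName"], "-", doc.metadata["Subject"])
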
@@ -97,7 +207,7 @@ for r in root:



+"""
 import re
 from collections import Counter

@@ -135,7 +245,7 @@ def edits2(word):
     "All edits that are two edits away from `word`."
     return (e2 for e1 in edits1(word) for e2 in edits1(e1))

+"""


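edits2 simply composes edits1 with itself. For context, the surrounding spell checker follows Peter Norvig's well-known corrector: edits1 enumerates all single-character deletes, transposes, replaces and inserts, and the correction is the known candidate with the highest word probability P. A minimal sketch of those two pieces, using the WORDS, P and edits2 defined alongside; the file's actual implementations may differ in detail (e.g. the alphabet used):

def edits1(word):
    "All edits that are one edit away from `word` (Norvig-style sketch)."
    letters    = 'abcdefghijklmnopqrstuvwxyzäöüß'   # a-z plus German umlauts, assumed
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces   = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts    = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def correction(word):
    "Most probable spelling correction for `word`, ranked by P."
    known = lambda ws: {w for w in ws if w in WORDS}
    candidates = known([word]) or known(edits1(word)) or known(edits2(word)) or [word]
    return max(candidates, key=P)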