chg: [feature] add punct statistics for the oov (but the punct in spacy.io seems super buggy or incorrect)
This commit is contained in:
Alexandre Dulaunoy 2020-08-20 14:33:15 +02:00
parent 526f88071c
commit 3c3760019e
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD

View file

@ -36,7 +36,9 @@ with open(args.f, 'r') as file:
doc = nlp(text) doc = nlp(text)
analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin", "digit:napkin", "url:napking", "oov:napkin", "labels:napkin"] analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin",
"digit:napkin", "url:napking", "oov:napkin", "labels:napkin",
"punct:napkin"]
redisdb.hset("stats", "token", doc.__len__()) redisdb.hset("stats", "token", doc.__len__())
@ -75,6 +77,11 @@ for token in doc:
redisdb.zincrby("email:napkin", 1, value) redisdb.zincrby("email:napkin", 1, value)
redisdb.hincrby("stats", "email:napkin", 1) redisdb.hincrby("stats", "email:napkin", 1)
continue continue
if token.is_punct:
redisdb.zincrby("punct:napkin", 1, value)
redisdb.hincrby("stats", "punct:napkin", 1)
continue
redisdb.zincrby("oov:napkin", 1, value) redisdb.zincrby("oov:napkin", 1, value)
redisdb.hincrby("stats", "oov:napkin", 1) redisdb.hincrby("stats", "oov:napkin", 1)