From 3c3760019ed5b7da92c1caa99af81e0687109863 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Thu, 20 Aug 2020 14:33:15 +0200 Subject: [PATCH] chg: [feature] add punct statistics for the oov (but the punct in spacy.io seems super buggy or incorrect) --- bin/napkin.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/napkin.py b/bin/napkin.py index 83fa89b..60c1b1a 100644 --- a/bin/napkin.py +++ b/bin/napkin.py @@ -36,7 +36,9 @@ with open(args.f, 'r') as file: doc = nlp(text) -analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin", "digit:napkin", "url:napking", "oov:napkin", "labels:napkin"] +analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin", + "digit:napkin", "url:napking", "oov:napkin", "labels:napkin", + "punct:napkin"] redisdb.hset("stats", "token", doc.__len__()) @@ -75,6 +77,11 @@ for token in doc: redisdb.zincrby("email:napkin", 1, value) redisdb.hincrby("stats", "email:napkin", 1) continue + if token.is_punct: + redisdb.zincrby("punct:napkin", 1, value) + redisdb.hincrby("stats", "punct:napkin", 1) + continue + redisdb.zincrby("oov:napkin", 1, value) redisdb.hincrby("stats", "oov:napkin", 1)