mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-22 01:47:06 +00:00
new: [napkin] first release
Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary
This commit is contained in:
parent
e3e27c7ce9
commit
dd7c796460
3 changed files with 3557 additions and 1 deletions
166
README.md
166
README.md
|
@ -1,2 +1,166 @@
|
|||
# napkin-text-analysis
|
||||
Napking is a simple tool to produce statistical analysis of a text
|
||||
|
||||
Napkin is a Python tool to produce statistical analysis of a text.
|
||||
|
||||
Analysis features are :
|
||||
|
||||
- Verbs frequency
|
||||
- Nouns frequency
|
||||
- Digit frequency
|
||||
- Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities)
|
||||
- URL frequency
|
||||
- Email frequency
|
||||
- Mention frequency (everything prefixed with an @ symbol)
|
||||
- Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary
|
||||
|
||||
# requirements
|
||||
|
||||
- Python >= 3.6
|
||||
- spacy.io
|
||||
- redis (a redis server running on port 6380)
|
||||
|
||||
# how to use napkin
|
||||
|
||||
~~~~
|
||||
usage: napkin.py [-h] [-v V] [-f F] [-t T] [-o O]
|
||||
|
||||
Extract statistical analysis of text
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v V verbose output
|
||||
-f F file to analyse
|
||||
-t T maximum value for the top list (default is 100) -1 is no limit
|
||||
-o O output format (default is csv)
|
||||
~~~~
|
||||
|
||||
# example usage of napkin
|
||||
|
||||
A sample file "The Prince, by Nicoló Machiavelli" is included to test napkin.
|
||||
|
||||
`python3 napkin.py -f ../samples/the-prince.txt`
|
||||
|
||||
Example output:
|
||||
|
||||
~~~~
|
||||
# Top 100 of verb:napkin
|
||||
b'can',137.0
|
||||
b'make',116.0
|
||||
b'may',106.0
|
||||
b'would',102.0
|
||||
b'must',97.0
|
||||
b'take',86.0
|
||||
b'have',73.0
|
||||
b'see',72.0
|
||||
b'become',62.0
|
||||
b'find',61.0
|
||||
b'know',59.0
|
||||
b'should',54.0
|
||||
b'keep',53.0
|
||||
b'give',53.0
|
||||
b'hold',51.0
|
||||
b'say',50.0
|
||||
b'wish',48.0
|
||||
b'could',48.0
|
||||
b'fear',46.0
|
||||
b'maintain',45.0
|
||||
b'think',42.0
|
||||
b'use',40.0
|
||||
b'consider',40.0
|
||||
b'come',40.0
|
||||
b'lose',37.0
|
||||
b'live',35.0
|
||||
b'follow',33.0
|
||||
b'do',33.0
|
||||
b'remain',32.0
|
||||
b'gain',31.0
|
||||
b'avoid',31.0
|
||||
b'arise',31.0
|
||||
b'speak',29.0
|
||||
...
|
||||
# Top 100 of noun:napkin
|
||||
b'man',120.0
|
||||
b'state',108.0
|
||||
b'people',90.0
|
||||
b'one',90.0
|
||||
b'time',85.0
|
||||
b'work',83.0
|
||||
b'other',82.0
|
||||
b'thing',71.0
|
||||
b'way',60.0
|
||||
b'order',57.0
|
||||
b'fortune',49.0
|
||||
b'army',45.0
|
||||
b'force',44.0
|
||||
b'arm',44.0
|
||||
b'soldier',43.0
|
||||
b'subject',42.0
|
||||
b'power',41.0
|
||||
b'difficulty',39.0
|
||||
b'law',34.0
|
||||
b'reputation',33.0
|
||||
b'position',33.0
|
||||
b'enemy',33.0
|
||||
b'war',32.0
|
||||
b'kingdom',32.0
|
||||
b'cause',31.0
|
||||
b'possession',29.0
|
||||
b'action',29.0
|
||||
b'ruler',28.0
|
||||
b'rule',28.0
|
||||
b'example',28.0
|
||||
b'hand',27.0
|
||||
b'friend',27.0
|
||||
b'country',27.0
|
||||
b'king',26.0
|
||||
b'case',26.0
|
||||
...
|
||||
# Top 100 of digit:napkin
|
||||
b'84116',1.0
|
||||
b'750175',1.0
|
||||
b'6221541',1.0
|
||||
b'57037',1.0
|
||||
b'55901',1.0
|
||||
#
|
||||
# Top 100 of url:napking
|
||||
#
|
||||
# Top 100 of oov:napkin
|
||||
b'Fermo',7.0
|
||||
b'Vitelli',6.0
|
||||
b'Pertinax',6.0
|
||||
b'Orsinis',6.0
|
||||
b'Colonnas',6.0
|
||||
b'Bentivogli',6.0
|
||||
b'Agathocles',6.0
|
||||
b'Oliverotto',5.0
|
||||
b'C\xc3\xa6sar',5.0
|
||||
...
|
||||
# Top 100 of labels:napkin
|
||||
b'GPE',305.0
|
||||
b'CARDINAL',197.0
|
||||
b'ORG',189.0
|
||||
b'NORP',131.0
|
||||
b'ORDINAL',72.0
|
||||
b'DATE',44.0
|
||||
b'LAW',30.0
|
||||
b'LOC',18.0
|
||||
b'PRODUCT',9.0
|
||||
b'LANGUAGE',5.0
|
||||
b'WORK_OF_ART',4.0
|
||||
b'QUANTITY',4.0
|
||||
b'TIME',3.0
|
||||
b'FAC',3.0
|
||||
b'MONEY',2.0
|
||||
b'PERCENT',1.0
|
||||
b'EVENT',1.0
|
||||
|
||||
~~~~
|
||||
|
||||
# LICENSE
|
||||
|
||||
napkin is free software under the AGPLv3 license.
|
||||
|
||||
~~~~
|
||||
Copyright (C) 2020 Alexandre Dulaunoy
|
||||
Copyright (C) 2020 Pauline Bourmeau
|
||||
~~~~
|
||||
|
|
80
bin/napkin.py
Normal file
80
bin/napkin.py
Normal file
|
@ -0,0 +1,80 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import redis
|
||||
import spacy
|
||||
from spacy_langdetect import LanguageDetector
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
|
||||
parser.add_argument('-v', help="verbose output")
|
||||
parser.add_argument('-f', help="file to analyse")
|
||||
parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
|
||||
parser.add_argument('-o', help="output format (default is csv)", default="csv")
|
||||
args = parser.parse_args()
|
||||
if args.f is None:
|
||||
parser.print_help()
|
||||
sys.exit()
|
||||
|
||||
redisdb = redis.Redis(host="localhost", port=6380, db=5)
|
||||
|
||||
try:
|
||||
redisdb.flushdb()
|
||||
except:
|
||||
print("Redis database on port 6380 is not running...", file=sys.stderr)
|
||||
sys.exit()
|
||||
|
||||
nlp = spacy.load("en_core_web_md")
|
||||
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
|
||||
|
||||
nlp.max_length = 2000000
|
||||
|
||||
with open(args.f, 'r') as file:
|
||||
text = file.read()
|
||||
|
||||
doc = nlp(text)
|
||||
|
||||
analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin", "digit:napkin", "url:napking", "oov:napkin", "labels:napkin"]
|
||||
|
||||
for token in doc:
|
||||
if token.pos_ == "VERB" and not token.is_oov:
|
||||
redisdb.zincrby("verb:napkin", 1, token.lemma_)
|
||||
continue
|
||||
if token.pos_ == "NOUN" and not token.is_oov:
|
||||
redisdb.zincrby("noun:napkin", 1, token.lemma_)
|
||||
continue
|
||||
|
||||
if token.is_oov:
|
||||
value = "{}".format(token)
|
||||
if value.startswith('#'):
|
||||
redisdb.zincrby("hashtag:napkin", 1, value[1:])
|
||||
continue
|
||||
if value.startswith('@'):
|
||||
redisdb.zincrby("mention:napkin", 1, value[1:])
|
||||
continue
|
||||
if token.is_digit:
|
||||
redisdb.zincrby("digit:napkin", 1, value)
|
||||
continue
|
||||
if token.is_space:
|
||||
continue
|
||||
if token.like_url:
|
||||
redisdb.zincrby("url:napkin", 1, value)
|
||||
continue
|
||||
if token.like_email:
|
||||
redisdb.zincrby("email:napkin", 1, value)
|
||||
continue
|
||||
redisdb.zincrby("oov:napkin", 1, value)
|
||||
|
||||
|
||||
for entity in doc.ents:
|
||||
redisdb.zincrby("labels:napkin", 1, entity.label_)
|
||||
|
||||
for anal in analysis:
|
||||
x = redisdb.zrevrange(anal, 1, args.t, withscores=True)
|
||||
print ("# Top {} of {}".format(args.t, anal))
|
||||
for a in x:
|
||||
if args.o == "csv":
|
||||
print ("{},{}".format(a[0],a[1]))
|
||||
print ("#")
|
||||
|
3312
samples/the-prince.txt
Normal file
3312
samples/the-prince.txt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue