# compute-df-counts.py (from a fork of boudinfl/pke)
# -*- coding: utf-8 -*-
import logging
import sys
from string import punctuation
from pke import compute_document_frequency
# Usage: python compute-df-counts.py INPUT_DIR OUTPUT_FILE
#
# Computes document frequency (DF) counts over a collection of documents
# with pke and writes them to OUTPUT_FILE.

# show INFO-level log messages in the terminal
logging.basicConfig(level=logging.INFO)

# Both positional arguments are required; exit with a usage message instead
# of an IndexError traceback when one is missing. Extra arguments are still
# tolerated, as before.
if len(sys.argv) < 3:
    sys.exit('usage: {} INPUT_DIR OUTPUT_FILE'.format(sys.argv[0]))

# path to the collection of documents
input_dir = sys.argv[1]

# path to the df weights dictionary, saved as a gzipped csv file
output_file = sys.argv[2]

# stoplist: punctuation marks plus the lowercased bracket placeholder tokens
# ('-lrb-', '-rrb-', ...) so that they are not counted as terms
stoplist = list(punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# compute document frequency counts (not idf weights: raw df counts are
# what gets written to output_file)
compute_document_frequency(input_dir=input_dir,
                           output_file=output_file,
                           extension='xml',  # input file extension
                           language='en',  # language of the input files
                           normalization="stemming",  # use porter stemmer
                           stoplist=stoplist,  # stoplist
                           delimiter='\t',  # tab separated output
                           n=5)  # compute n-grams up to 5-grams