# stanford_ner.prop
#
# NOTE: most of these features were gathered by looking at the source code of these files:
#
# https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/ie/NERFeatureFactory.java
# https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/ie/crf/CRFClassifier.java
# https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/sequences/SeqClassifierFlags.java
#
#
#################
# Files locations
#################
# serializeTo = CINTIL-BIO-CRF-model.ser.gz
# trainFile =
# testFile =
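#
# A minimal sketch of how a prop file like this is typically invoked on the
# command line (the jar name and the data-file paths below are assumptions,
# not part of this repo):
#
#   java -cp stanford-ner.jar edu.stanford.nlp.ie.crf.CRFClassifier \
#       -prop stanford_ner.prop -trainFile train.tsv -serializeTo CINTIL-BIO-CRF-model.ser.gz
#   java -cp stanford-ner.jar edu.stanford.nlp.ie.crf.CRFClassifier \
#       -loadClassifier CINTIL-BIO-CRF-model.ser.gz -testFile test.tsv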
# mapping of column meanings in the train/test files
map = word=0,answer=1
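#
# With this mapping, train/test files are tab-separated with one token per line
# (column 0 = word, column 1 = answer tag) and a blank line between sentences;
# the tokens and tags below are illustrative:
#
#   Lisboa	B-LOC
#   é	O
#   bonita	O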
########################
# General/Misc. Features
########################
# The number of words on each side of the current word that are included in the disjunction features
disjunctionWidth = 3
# Include features giving disjunctions of words anywhere in the left or right disjunctionWidth
# words (preserving direction but not position)
useDisjunctive = true
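# (an informal restatement: with disjunctionWidth = 3, each of the 3 words to the
# left and the 3 words to the right of the current token fires a position-free,
# direction-aware disjunction feature paired with the class)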
# Gives you feature for w
useWord = true
# Include word and tag pair features
useWordTag = true
# Gives you feature for (pw,c), and together with other options enables other previous features, such as (pt,c) [with useTags]
usePrev = true
# Gives you feature for (nw,c), and together with other options enables other next features, such as (nt,c) [with useTags]
useNext = true
# Gives you features for (t,c), (pt,c) [if usePrev], (nt,c) [if useNext]
useTags = true
# Gives you features for (pw, w, c) and (w, nw, c)
useWordPairs = true
# Gives you features (pt, t, nt, c), (t, nt, c), (pt, t, c)
#useSymTags = false
# Gives you features (pw, nw, c)
#useSymWordPairs = false
# Use extra second order class sequence features when previous is CoNLL boundary, so entity knows it can span boundary.
#useBoundarySequences = false
# As an override to whatever other options are in effect, deletes all features other than
# C and CpC clique features when building the classifier
#strictlyFirstOrder = false
# Include features giving disjunctions of words anywhere in the left or
# right wideDisjunctionWidth words
# (preserving direction but not position)
#useWideDisjunctive = false
#wideDisjunctionWidth = 8
# Use combination of initial position in sentence and class (and word shape)
# as a feature. (Doesn't seem to help.)
#useBeginSent = true
# Iff the prev word is of length 3 or less, add an extra feature that combines
# the word two back and the current word's shape. Weird!
#useLastRealWord = false
#useNextRealWord = false
###########################
# Sequence-related features
###########################
# Does not use any class combination features using previous classes if this is false
#usePrevSequences = true
# Does not use any class combination features using next classes if this is false
#useNextSequences = true
# Use plain higher-order state sequences out to minimum of length or maxLeft
#useLongSequences = false
# Does not use any class combination features if this is false
#useSequences = true
# Use first, second, and third order class and tag sequence interaction features
#useTaggySequences = false
# Add in sequences of tags with just current class features
#useExtraTaggySequences = false
# Add in terms that join sequences of 2 or 3 tags with the current shape
#useTaggySequencesShapeInteraction = false
# Don't extend the range of useTaggySequences when maxLeft is increased.
# dontExtendTaggy = false
########################
# Shape-related features
########################
# Either "none" for no wordShape use, or the name of a word shape function recognized by
# WordShapeClassifier.lookupShaper(String)
#
# https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/process/WordShapeClassifier.java
wordShape = chris2useLC
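#
# A word shape abstracts a token into a pattern over character classes, e.g.
# "Lisboa" -> roughly "Xxxxxx" and "2019" -> "dddd"; this is an illustrative
# simplification, not the exact chris2useLC encoding (see WordShapeClassifier
# for what each shaper actually produces)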
# Include features giving disjunctions of word shapes anywhere in the left or right disjunctionWidth
# words (preserving direction but not position)
useDisjShape = true
# Conjoin shape with tag or position
useShapeConjunctions = true
# Include features giving disjunctions of words anywhere in the left or right disjunctionWidth
# words (preserving direction but not position) interacting with the word shape of the current word
useDisjunctiveShapeInteraction = false
# Some first order word shape patterns.
# useTypeySequences = true
################
# Entity Schema
################
# If set, convert the labeling of classes (but not the background) into one of several alternate encodings
# (IO, IOB1, IOB2, IOE1, IOE2, SBIEO), with a S(ingle), B(eginning), E(nding), I(nside) 4-way classification for each class.
# By default, we either do no re-encoding, or the CoNLLDocumentIteratorFactory does a lossy encoding as IO.
# Note that this is all CoNLL-specific, depends on their way of prefix-encoding classes, and is only implemented
# by the CoNLLDocumentIteratorFactory.
entitySubclassification = IOB2
# If true, rather than undoing a recoding of entity tag subtypes (such as BIO variants), just leave them in the output.
retainEntitySubclassification = true
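#
# Illustration of the IOB2 choice (tags are illustrative):
#   IO:    José=I-PER  Saramago=I-PER  visitou=O  Lisboa=I-LOC
#   IOB2:  José=B-PER  Saramago=I-PER  visitou=O  Lisboa=B-LOC
# (IOB1 uses B- only when two same-class entities are immediately adjacent)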
#########
# N-Grams
#########
# Make features from letter n-grams, i.e., substrings of the word
#useNGrams = false
# Make features from letter n-grams only lowercase
#lowercaseNGrams = false
# Remove hyphens before making features from letter n-grams
#dehyphenateNGrams = false
# Conjoin word shape and n-gram features
#conjoinShapeNGrams = false
# Use letter n-grams for the previous and current words in the CpC clique. This feature helps languages such as Chinese, but not so much for English
#useNeighborNGrams = false
# If true, record the NGram features that correspond to a String (under the current option settings) and reuse rather than recalculating if the String is seen again.
#cacheNGrams = false
# Do not include character n-gram features for n-grams that contain neither the beginning nor the end of the word
#noMidNGrams = true
# If this number is positive, n-grams above this size will not be used in the model
#maxNGramLeng = -1
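#
# e.g. for the word "Porto", letter n-grams are substrings such as "P", "Po",
# "Por", "rto", "to", "o"; with noMidNGrams = true, internal substrings like
# "ort" that touch neither end of the word would be dropped (illustrative)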
##########################
# Gazetteer and Name lists
##########################
useGazettes = true
# one or more filenames (names separated by a comma, semicolon or space).
gazette=resources/DBPedia-pt-per-gazette.txt;resources/DBPedia-pt-loc-gazette.txt;resources/DBPedia-pt-org-gazette.txt
# If true, a gazette feature fires when all tokens of a gazette entry match
cleanGazette = true
# If true, a gazette feature fires when any token of a gazette entry matches
sloppyGazette = false
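#
# Gazette files are plain text with one entry per line: the class label, then
# whitespace, then the entry phrase (the entries below are illustrative):
#
#   LOCATION Lisboa
#   PERSON José Saramago
#
# With cleanGazette, "José Saramago" fires only when both tokens match in order;
# with sloppyGazette, "José" or "Saramago" alone would be enough.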
# undocumented features, but part of the featuresC() method in NERFeatureFactory.java
checkNameList=true
lastNameList=resources/gazettes/lastNames.txt
maleNameList=resources/gazettes/all-first-names.txt
femaleNameList=resources/gazettes/all-first-names.txt
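# (each of these name-list files is assumed to hold one name per line)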
# Match a word against a list of name titles (Mr, Mrs, etc.). Doesn't really seem to help.
# useTitle = true
# Match a word against a better list of English name titles (Mr, Mrs, etc.). Still doesn't really seem to help.
# useTitle2 = true
# This is a very engineered feature designed to capture multiple references to names
useOccurrencePatterns = true
################
# Word Clusters
################
# Load a file of distributional similarity classes (specified by distSimLexicon) and use it for features
useDistSim = true
# Files should be formatted as tab separated rows where each row is a word/class pair. alexclark=word first, terrykoo=class first
# distSimFileFormat = alexclark
# The file to be loaded for distsim classes.
distSimLexicon = resources/word_cluster.txt
# keep word case when reading from the lexicon
casedDistSim = true
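#
# With the default alexclark format, each lexicon row is word<TAB>clusterID;
# the cluster IDs below are illustrative:
#
#   Lisboa	C417
#   Porto	C417
#   comeu	C52
#
# casedDistSim = true means "Lisboa" and "lisboa" are looked up as distinct entries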
#########################
# Optimization parameters
#
# Note:
# - see the getMinimizer() method in CRFClassifier.java for many more flags and different optimization methods
# - https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/sequences/SeqClassifierFlags.html
###########################################################################
# parameters for Orthant-Wise Limited-memory Quasi-Newton algorithm (OWL-QN)
useOWLQN = true
# L1-prior used in QNMinimizer's OWLQN
priorLambda = 0.1
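# (i.e. the objective becomes: negative log-likelihood + priorLambda * ||w||_1;
# larger values drive more feature weights to exactly zero)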
# terminate after this maximum number of iterations
maxQNItr = 125
# Convergence tolerance in optimization
tolerance = 1e-4
###########################################
# parameters for Stochastic Gradient Descent
#useInPlaceSGD = false
# If this number is greater than 0, specifies the number of SGD passes over the entire training set to
# do before giving up (default is 50). Can be smaller if sample size is very large.
#SGDPasses = 100
# If this number is greater than 0, specifies the number of samples to use for tuning (default is 1000).
#tuneSampleSize = 1000
# Standard deviation of the Gaussian (L2) prior on the feature weights
# (presumably unused here, since the OWL-QN L1 prior is enabled above)
#sigma = 20
# Stochastic Batch Gradient Descent Size
#stochasticBatchSize = 15
# Use SGD (SGD version selected by useInPlaceSGD or useSGD) for a certain number of passes (SGDPasses)
# and then switch to QN. Gives the quick initial convergence of SGD, with the desired convergence
# criterion of QN (there is some ramp-up time for QN). NOTE: remember to set useQN to false
#useSGDtoQN = false
#useQN = true
#######################################
# Applies only to English (I suppose..)
#######################################
# If this is true, some words are normalized: day and month names are lowercased (as for
# normalizeTimex) and some British spellings are mapped to American English
#normalizeTerms = false
# If this is true, capitalization of day and month names is normalized to lowercase
#normalizeTimex = false
# Include the lemma of a word as a feature.
#useLemmas = false
# Include the previous/next lemma of a word as a feature.
#usePrevNextLemmas = false
# Include the lemma of a word as a word (i.e., use the lemma in place of the word itself).
#useLemmaAsWord = false
########################################
# Save features and extra stuff to file
########################################
# A parameter to the printing style, which may give, for example, the number of parameters to print
#printClassifierParam = 100
# Print out all feature/class pairs and their weight, and then for each input data point,
# print justification (weights) for active features
#justify = true
# Print out all the features generated by the classifier for a dataset to a file based on this
# name (starting with "features-", suffixed "-1" and "-2" for train and test).
# This simply prints the feature names, one per line.
#printFeatures = portuguese
# Print out features for only the first this many datums, if the value is positive.
#printFeaturesUpto = -1
# Style in which to print the classifier. One of: HighWeight, HighMagnitude, Collection, AllWeights, WeightHistogram
#printClassifier = AllWeights