forked from Brown-University-Library/iip-texts
-
Notifications
You must be signed in to change notification settings - Fork 1
/
solr-schema.xml
228 lines (193 loc) · 12.8 KB
/
solr-schema.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
<?xml version="1.0" encoding="UTF-8"?>
<!-- Initially created by Carlos Rotger on 4/8/14 -->
<schema name="iip" version="1.0">
<!--
Defines the types of fields associated with this schema with a series of:
<fieldType name="____" class="____" ...> ... </fieldType>
statements.
@name is just a label, @class is the actual Solr datatype for the field.
Each fieldType can have up to two <analyzer> children that define how a field is analyzed before being searched for/inserted.
The analyzer with @type=index is applied to incoming field values before they are indexed.
The analyzer with @type=query is applied to search terms before they are used to find elements in the index.
If the fieldType only has one analyzer with no @type, it is applied to both.
Analyzer elements contain <charFilter> elements, exactly one <tokenizer> element, and <filter> elements, each with an @class that
defines the Java class that runs the charFilter, tokenizer or filter, and other attributes that serve as arguments to that class.
charFilters rewrite the raw character stream before it is tokenized (e.g. strip accents), tokenizers break the data into
discrete search tokens, and filters then change those tokens (lower-casing, stemming, stop-word removal, etc.).
-->
<types>
<!-- Taken from the original IIP schema -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <!--
  General-purpose full-text type.
  The index and query analyzers apply the same steps in the same order so both sides
  produce comparable terms. The only differences are that synonyms are expanded at
  query time only, and that word/number parts are catenated at index time only.
  NOTE(review): reindex after deploying a change to the index analyzer.
  -->
  <analyzer type="index">
    <!-- Remove greek accents. charFilters act on the raw text before the tokenizer, so they are listed first. -->
    <charFilter class="solr.MappingCharFilterFactory" mapping="greek_accent_cleaning.txt"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- in this example, we will only use synonyms at query time
    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
    -->
    <!-- Case insensitive stop word removal.
    add enablePositionIncrements=true in both the index and query
    analyzers to leave a 'gap' for more accurate phrase queries.
    -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <!-- Must run before lower-casing because it splits on case changes. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <!-- Snowball algorithm for stemming English words; runs on already lower-cased tokens -->
    <filter class="solr.SnowballPorterFilterFactory"
            language="English"
            protected="protwords.txt"/>
    <!-- Kept last so that tokens made identical by any earlier filter are collapsed -->
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <!-- Remove greek accents from full-text queries -->
    <charFilter class="solr.MappingCharFilterFactory" mapping="greek_accent_cleaning.txt"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <!-- Snowball algorithm for stemming English words -->
    <filter class="solr.SnowballPorterFilterFactory"
            language="English"
            protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Taken from original IIP schema -->
<!--
The StrField type is not analyzed, but indexed/stored verbatim.
- StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values which
exceed a certain size (in characters).
-->
<fieldType name="string"
           class="solr.StrField"
           sortMissingLast="true"
           omitNorms="true"/>
<fieldType name="stringSplit" class="solr.TextField" sortMissingLast="true">
  <!-- Same analysis at index and query time: one token per whitespace-separated value, with every '#' character removed. -->
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="[#]*"
            replacement=""
            replace="all"/>
  </analyzer>
</fieldType>
<fieldType name="text_transcription" class="solr.TextField" positionIncrementGap="100">
  <!--
  Full-text type for transcriptions. A single analyzer is used at both index and query time.
  Lower-casing runs before the stemmer, as in the "text" type. The stemmer only recognises
  lower-case suffixes, so stemming first left upper-case tokens unstemmed and indexed them
  under different terms from their lower-case forms.
  NOTE(review): existing documents must be reindexed for this change to take effect.
  -->
  <analyzer>
    <!-- Remove greek accents. charFilters act on the raw text before the tokenizer, so they are listed first. -->
    <charFilter class="solr.MappingCharFilterFactory" mapping="greek_accent_cleaning.txt"/>
    <!-- Split tokens over whitespace -->
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Disabled, kept for reference: synonym expansion (synonyms.txt) and removal of common
    or meaningless words, e.g. "a", "the", "or", etc. (stopwords.txt).
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true"
    expand="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"
    enablePositionIncrements="true"/>
    -->
    <!-- Breaks constructions like "hello-world" and "HelloWorld" into "hello" and "world".
    This one is set to break on word parts and case changes mid-word (e.g. hello-world
    or HelloWorld both become two tokens hello and world), number parts ("100-25" to
    "100" and "25"), and is set to only index the separate tokens, not their concatenations,
    i.e. "hello-world" is indexed as just "hello" and "world", and not additionally as "helloworld".
    It must run before lower-casing because it splits on case changes.
    -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <!-- Makes all the input lower-case -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <!-- Snowball algorithm for stemming English words; runs on already lower-cased tokens -->
    <filter class="solr.SnowballPorterFilterFactory"
            language="English"
            protected="protwords.txt"/>
    <!-- Removes duplicate tokens; kept last so tokens made identical by earlier filters are collapsed -->
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Numeric types -->
<fieldType name="long"
           class="solr.TrieLongField"
           precisionStep="0"
           positionIncrementGap="0"/>
<fieldType name="int" class="solr.TrieIntField"/>
<!-- Spatial type; used by the "*_geo" dynamic field -->
<fieldType name="geoloc"
           class="solr.SpatialRecursivePrefixTreeFieldType"
           distErrPct="0.025"
           maxDistErr="0.000009"
           units="degrees"/>
</types>
<!--
Defines a set of fields in the index with <field> statements. Fairly straightforward.
Taken from the original IIP schema, but transcription_search is changed to being of type "text_transcription"
-->
<fields>
<!-- Document version field -->
<field name="_version_" type="long" indexed="true" stored="true" multiValued="false"/>

<!-- general -->
<!-- Identifier; required, and declared as the schema's uniqueKey -->
<field name="inscription_id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
<!--<field name="place" type="text" indexed="true" stored="true" multiValued="true" />-->

<!-- Location and date range -->
<field name="region" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="city" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="notAfter" type="int" indexed="true" stored="true" multiValued="false"/>
<field name="notBefore" type="int" indexed="true" stored="true" multiValued="false"/>

<!-- Multi-valued classification fields -->
<field name="type" type="stringSplit" indexed="true" stored="true" multiValued="true"/>
<field name="language" type="stringSplit" indexed="true" stored="true" multiValued="true"/>
<field name="language_display" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="religion" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="religion_display" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="physical_type" type="stringSplit" indexed="true" stored="true" multiValued="true"/>
<field name="material" type="stringSplit" indexed="true" stored="true" multiValued="true"/>
<field name="figure_desc" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="figure" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="place_found" type="text" indexed="true" stored="true" multiValued="false"/>

<!--
Inscription texts. The plain fields are stored for display; the *_search fields are
indexed but not stored.
NOTE(review): transcription, translation and diplomatic carry no "indexed" attribute, so
they take whatever default applies to the "string" type. If that default is indexed,
very long values may exceed the maximum indexed term length; confirm and consider
setting indexed="false" explicitly.
-->
<field name="transcription" type="string" stored="true" multiValued="false"/>
<field name="transcription_search" type="text_transcription" indexed="true" stored="false" multiValued="false"/>
<field name="translation" type="string" stored="true" multiValued="false"/>
<field name="translation_search" type="text" indexed="true" stored="false" multiValued="false"/>
<field name="diplomatic" type="string" stored="true" multiValued="false"/>

<!-- Descriptions -->
<field name="short_description" type="text" indexed="true" stored="true" multiValued="false"/>
<field name="description" type="text" indexed="true" stored="true" multiValued="false"/>

<!-- Stored only (not indexed), apart from placeMenu which is indexed -->
<field name="dimensions" type="string" indexed="false" stored="true" multiValued="false"/>
<field name="bibl" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="biblDiplomatic" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="biblTranscription" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="biblTranslation" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="placeMenu" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="image" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="imageSource" type="string" indexed="false" stored="true" multiValued="true"/>

<!--For status facet; possible values: [ 'to_approve', 'approved', 'to_correct' ]-->
<field name="display_status" type="string" indexed="true" stored="true" multiValued="false"/>
<!--
Catch all full-text indexing fields. Each one is filled by the copyField rules
at the end of this schema:
  text      transcription / translation / diplomatic text
  metadata  descriptive metadata
  place     region, city and placeMenu
-->
<field name="text"
       type="text_transcription"
       indexed="true"
       stored="true"
       omitNorms="false"
       required="false"
       multiValued="true"/>
<field name="metadata"
       type="text"
       indexed="true"
       stored="true"
       omitNorms="false"
       required="false"
       multiValued="true"/>
<field name="place"
       type="text"
       indexed="true"
       stored="true"
       omitNorms="false"
       required="false"
       multiValued="true"/>
<!-- Dynamic fields matched by name suffix: "*_geo" is spatial, "*_pleiades" is a single string value -->
<dynamicField name="*_geo"
              type="geoloc"
              indexed="true"
              stored="true"
              multiValued="true"
              required="false"/>
<dynamicField name="*_pleiades"
              type="string"
              indexed="true"
              stored="true"
              multiValued="false"
              required="false"/>
<!-- Fallback pattern: matches any field name and treats it as a multi-valued string -->
<dynamicField name="*"
              type="string"
              indexed="true"
              stored="true"
              multiValued="true"
              required="false"/>
</fields>
<!-- Primary, unique key of the index: each document is identified by its inscription_id -->
<uniqueKey>inscription_id</uniqueKey>
<!--
Field searched when a query names no field.
NOTE(review): this element and solrQueryParser below are reportedly deprecated in newer
Solr releases; confirm against the deployed Solr version before upgrading.
-->
<defaultSearchField>text</defaultSearchField>
<!-- Operator applied between search tokens when the query gives none -->
<solrQueryParser defaultOperator="AND"/>
<!--
copyField rules copy the value of one field into another when a document is added
to the index. They are used here to gather several source fields into one
destination field, so that a single field can be searched instead of many.
-->
<!-- <copyField source="figure" dest="text"/>-->

<!-- Destination "metadata": metadata string search -->
<!--<copyField source="place" dest="metadata"/>-->
<copyField source="type"              dest="metadata"/>
<copyField source="material"          dest="metadata"/>
<copyField source="place_found"       dest="metadata"/>
<copyField source="region"            dest="metadata"/>
<copyField source="city"              dest="metadata"/>
<copyField source="short_description" dest="metadata"/>
<copyField source="description"       dest="metadata"/>
<copyField source="inscription_id"    dest="metadata"/>

<!-- Destination "text": transcription/translation search -->
<copyField source="transcription_search" dest="text"/>
<copyField source="translation_search"   dest="text"/>
<copyField source="diplomatic"           dest="text"/>

<!-- Destination "place": place search -->
<copyField source="region"    dest="place"/>
<copyField source="city"      dest="place"/>
<copyField source="placeMenu" dest="place"/>
</schema>