-
Notifications
You must be signed in to change notification settings - Fork 242
/
docs-scraper.config.json
71 lines (71 loc) · 2.35 KB
/
docs-scraper.config.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
{
  "index_uid": "production",
  "start_urls": ["https://www.meilisearch.com/docs"],
  "sitemap_urls": ["https://www.meilisearch.com/sitemap.xml"],
  "selectors": {
    "lvl0": {
      "selector": "#sidebar-list .sidebar-link[data-active=true] ~ .sidebar-category",
      "global": true,
      "default_value": "Documentation"
    },
    "lvl1": {
      "selector": ".docs-main h1",
      "global": true
    },
    "lvl2": ".docs-main h2",
    "lvl3": ".docs-main h3",
    "lvl4": ".docs-main h4",
    "lvl5": ".docs-main h5",
    "text": ".docs-main p, .docs-main li, .docs-main td"
  },
  "strip_chars": " .,;:#",
  "scrap_start_urls": true,
  "custom_settings": {
    "embedders": {
      "default": {
        "source": "openAi",
        "apiKey": "embedderApiKey",
        "model": "text-embedding-ada-002",
        "documentTemplate": "{% if doc.hierarchy_lvl0 != null %}Starting with the theme '{{doc.hierarchy_lvl0}}'. {% endif %}{% if doc.hierarchy_lvl1 != null %}A document titled '{{doc.hierarchy_lvl1}}'. {% endif %}{% if doc.hierarchy_lvl2 != null %}Under the section '{{doc.hierarchy_lvl2}}'. {% endif %}{% if doc.hierarchy_lvl3 != null %}This is further divided into '{{doc.hierarchy_lvl3}}'. {% endif %}{% if doc.hierarchy_lvl4 != null %}Which includes '{{doc.hierarchy_lvl4}}'. {% endif %}{% if doc.hierarchy_lvl5 != null %}Specifically focusing on '{{doc.hierarchy_lvl5}}'. {% endif %}{% if doc.anchor != null %}It's about {{doc.anchor}}. {% endif %}{% if doc.content != null %}It discusses {{doc.content|truncatewords: 200}}{% endif %}"
      }
    },
    "searchableAttributes": [
      "hierarchy_lvl1",
      "hierarchy_lvl2",
      "hierarchy_lvl3",
      "hierarchy_lvl4",
      "hierarchy_lvl5",
      "content",
      "hierarchy_lvl0"
    ],
    "synonyms": {
      "large language model": ["llm"],
      "llm": ["large language model"],
      "relevancy": ["relevant", "relevance"],
      "relevant": ["relevancy", "relevance"],
      "relevance": ["relevancy", "relevant"]
    }
  },
  "min_indexed_level": 2,
  "selectors_exclude": ["#page-navigation", "#feedback"]
}