fix: address bradens comments

openedx · Nov 28, 2024 · 73957b6 · 73957b6
1 parent de1d918
commit 73957b6
Show file tree

Hide file tree

Showing 2 changed files with 126 additions and 72 deletions.
diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py
@@ -62,6 +62,65 @@
 
 EXCLUDED_XBLOCK_TYPES = ['course', 'course_info']
 
+INDEX_DISTINCT_ATTRIBUTE = "usage_key"
+INDEX_FILTRABLE_ATTRIBUTES = [
+    # Get specific block/collection using combination of block_id and context_key
+    Fields.block_id,
+    Fields.block_type,
+    Fields.context_key,
+    Fields.usage_key,
+    Fields.org,
+    Fields.tags,
+    Fields.tags + "." + Fields.tags_taxonomy,
+    Fields.tags + "." + Fields.tags_level0,
+    Fields.tags + "." + Fields.tags_level1,
+    Fields.tags + "." + Fields.tags_level2,
+    Fields.tags + "." + Fields.tags_level3,
+    Fields.collections,
+    Fields.collections + "." + Fields.collections_display_name,
+    Fields.collections + "." + Fields.collections_key,
+    Fields.type,
+    Fields.access_id,
+    Fields.last_published,
+    Fields.content + "." + Fields.problem_types,
+]
+INDEX_SEARCHABLE_ATTRIBUTES = [
+    # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
+    Fields.display_name,
+    Fields.block_id,
+    Fields.content,
+    Fields.description,
+    Fields.tags,
+    Fields.collections,
+    # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
+    # are searchable only if at least one document in the index has a value. If we didn't list them here and,
+    # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
+    # these sub-fields: "Attribute `tags.level3` is not searchable."
+    Fields.tags + "." + Fields.tags_taxonomy,
+    Fields.tags + "." + Fields.tags_level0,
+    Fields.tags + "." + Fields.tags_level1,
+    Fields.tags + "." + Fields.tags_level2,
+    Fields.tags + "." + Fields.tags_level3,
+    Fields.collections + "." + Fields.collections_display_name,
+    Fields.collections + "." + Fields.collections_key,
+    Fields.published + "." + Fields.display_name,
+    Fields.published + "." + Fields.published_description,
+]
+INDEX_SORTABLE_ATTRIBUTES = [
+    Fields.display_name,
+    Fields.created,
+    Fields.modified,
+    Fields.last_published,
+]
+INDEX_RANKING_RULES = [
+    "sort",
+    "words",
+    "typo",
+    "proximity",
+    "attribute",
+    "exactness",
+]
+
 
 @contextmanager
 def _index_rebuild_lock() -> Generator[str, None, None]:
@@ -217,6 +276,18 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat
         _wait_for_meili_task(client.delete_index(temp_index_name))
 
 
+def _index_is_empty(index_name: str) -> bool:
+    """
+    Check if an index is empty
+
+    Args:
+        index_name (str): The name of the index to check
+    """
+    client = _get_meilisearch_client()
+    index = client.get_index(index_name)
+    return index.get_stats().number_of_documents == 0
+
+
 def _configure_index(index_name):
     """
     Configure the index. The following index settings are best changed on an empty index.
@@ -228,78 +299,17 @@ def _configure_index(index_name):
     client = _get_meilisearch_client()
 
     # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique):
-    client.index(index_name).update_distinct_attribute(Fields.usage_key)
+    client.index(index_name).update_distinct_attribute(INDEX_DISTINCT_ATTRIBUTE)
     # Mark which attributes can be used for filtering/faceted search:
-    client.index(index_name).update_filterable_attributes(
-        [
-            # Get specific block/collection using combination of block_id and context_key
-            Fields.block_id,
-            Fields.block_type,
-            Fields.context_key,
-            Fields.usage_key,
-            Fields.org,
-            Fields.tags,
-            Fields.tags + "." + Fields.tags_taxonomy,
-            Fields.tags + "." + Fields.tags_level0,
-            Fields.tags + "." + Fields.tags_level1,
-            Fields.tags + "." + Fields.tags_level2,
-            Fields.tags + "." + Fields.tags_level3,
-            Fields.collections,
-            Fields.collections + "." + Fields.collections_display_name,
-            Fields.collections + "." + Fields.collections_key,
-            Fields.type,
-            Fields.access_id,
-            Fields.last_published,
-            Fields.content + "." + Fields.problem_types,
-        ]
-    )
+    client.index(index_name).update_filterable_attributes(INDEX_FILTRABLE_ATTRIBUTES)
     # Mark which attributes are used for keyword search, in order of importance:
-    client.index(index_name).update_searchable_attributes(
-        [
-            # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
-            Fields.display_name,
-            Fields.block_id,
-            Fields.content,
-            Fields.description,
-            Fields.tags,
-            Fields.collections,
-            # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
-            # are searchable only if at least one document in the index has a value. If we didn't list them here and,
-            # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
-            # these sub-fields: "Attribute `tags.level3` is not searchable."
-            Fields.tags + "." + Fields.tags_taxonomy,
-            Fields.tags + "." + Fields.tags_level0,
-            Fields.tags + "." + Fields.tags_level1,
-            Fields.tags + "." + Fields.tags_level2,
-            Fields.tags + "." + Fields.tags_level3,
-            Fields.collections + "." + Fields.collections_display_name,
-            Fields.collections + "." + Fields.collections_key,
-            Fields.published + "." + Fields.display_name,
-            Fields.published + "." + Fields.published_description,
-        ]
-    )
+    client.index(index_name).update_searchable_attributes(INDEX_SEARCHABLE_ATTRIBUTES)
     # Mark which attributes can be used for sorting search results:
-    client.index(index_name).update_sortable_attributes(
-        [
-            Fields.display_name,
-            Fields.created,
-            Fields.modified,
-            Fields.last_published,
-        ]
-    )
+    client.index(index_name).update_sortable_attributes(INDEX_SORTABLE_ATTRIBUTES)
 
     # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
     # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
-    client.index(index_name).update_ranking_rules(
-        [
-            "sort",
-            "words",
-            "typo",
-            "proximity",
-            "attribute",
-            "exactness",
-        ]
-    )
+    client.index(index_name).update_ranking_rules(INDEX_RANKING_RULES)
 
 
 def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None:
@@ -378,6 +388,32 @@ def reset_index(status_cb: Callable[[str], None] | None = None) -> None:
     status_cb("Index reset complete.")
 
 
+def _is_index_configured(index_name: str) -> bool:
+    """
+    Check if an index is completely configured
+
+    Args:
+        index_name (str): The name of the index to check
+    """
+    client = _get_meilisearch_client()
+    index = client.get_index(index_name)
+    settings = index.get_settings()
+    for k, v in (
+        ("distinctAttribute", INDEX_DISTINCT_ATTRIBUTE),
+        ("filterableAttributes", INDEX_FILTRABLE_ATTRIBUTES),
+        ("searchableAttributes", INDEX_SEARCHABLE_ATTRIBUTES),
+        ("sortableAttributes", INDEX_SORTABLE_ATTRIBUTES),
+        ("rankingRules", INDEX_RANKING_RULES),
+    ):
+        setting = settings.get(k, [])
+        if isinstance(v, list):
+            v = set(v)
+            setting = set(setting)
+        if setting != v:
+            return False
+    return True
+
+
 def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None:
     """
     Initialize the Meilisearch index, creating it and configuring it if it doesn't exist
@@ -388,10 +424,19 @@ def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable
         warn_cb = log.warning
 
     if _index_exists(STUDIO_INDEX_NAME):
-        warn_cb(
-            "A rebuild of the index is required. Please run ./manage.py cms reindex_studio"
-            " --experimental [--incremental]"
-        )
+        if _index_is_empty(STUDIO_INDEX_NAME):
+            warn_cb(
+                "The studio search index is empty. Please run ./manage.py cms reindex_studio"
+                " --experimental [--incremental]"
+            )
+            return
+        if not _is_index_configured(STUDIO_INDEX_NAME):
+            warn_cb(
+                "A rebuild of the index is required. Please run ./manage.py cms reindex_studio"
+                " --experimental [--incremental]"
+            )
+            return
+        status_cb("Index already exists and is configured.")
         return
 
     reset_index(status_cb)
@@ -424,8 +469,9 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=Fa
     num_courses = CourseOverview.objects.count()
 
     # Some counters so we can track our progress as indexing progresses:
-    num_contexts = num_courses + num_libraries
-    num_contexts_done = 0  # How many courses/libraries we've indexed
+    num_libs_skipped = len(keys_indexed)
+    num_contexts = num_courses + num_libraries + num_libs_skipped
+    num_contexts_done = 0 + num_libs_skipped  # How many courses/libraries we've indexed
     num_blocks_done = 0  # How many individual components/XBlocks we've indexed
 
     status_cb(f"Found {num_courses} courses, {num_libraries} libraries.")

diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py
@@ -294,6 +294,14 @@ def test_reset_meilisearch_index(self, mock_meilisearch):
 
     @override_settings(MEILISEARCH_ENABLED=True)
     def test_init_meilisearch_index(self, mock_meilisearch):
+        # Test index already exists
+        api.init_index()
+        mock_meilisearch.return_value.swap_indexes.assert_not_called()
+        mock_meilisearch.return_value.create_index.assert_not_called()
+        mock_meilisearch.return_value.delete_index.assert_not_called()
+
+        # Test index already exists and has no documents
+        mock_meilisearch.return_value.get_stats.return_value = 0
         api.init_index()
         mock_meilisearch.return_value.swap_indexes.assert_not_called()
         mock_meilisearch.return_value.create_index.assert_not_called()