Skip to content

Commit

Permalink
Merge pull request #60 from BackofenLab/workJannis
Browse files Browse the repository at this point in the history
Merge meilisearch to master
  • Loading branch information
JannisHaeffner authored Feb 22, 2024
2 parents a09a197 + e5be8c0 commit 0985873
Show file tree
Hide file tree
Showing 7 changed files with 887 additions and 0 deletions.
131 changes: 131 additions & 0 deletions backend/src/meilisearch/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# This file shows the default configuration of Meilisearch.
# All variables are defined here: https://www.meilisearch.com/docs/learn/configuration/instance_options#environment-variables

# Designates the location where database files will be created and retrieved.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#database-path
db_path = "./data.ms"

# Configures the instance's environment. Value must be either `production` or `development`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#environment
env = "development"

# The address on which the HTTP server will listen.
http_addr = "localhost:7700"

# Sets the instance's master key, automatically protecting all routes except GET /health.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#master-key
master_key = "aSampleMasterKey2"

# Deactivates Meilisearch's built-in telemetry when provided.
# Meilisearch automatically collects data from all instances that do not opt out using this flag.
# All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted at any time.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#disable-analytics
# no_analytics = true

# Sets the maximum size of accepted payloads.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#payload-limit-size
http_payload_size_limit = "550 MB"

# Defines how much detail should be present in Meilisearch's logs.
# Meilisearch currently supports six log levels, listed in order of increasing verbosity: `OFF`, `ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE`
# https://www.meilisearch.com/docs/learn/configuration/instance_options#log-level
log_level = "INFO"

# Sets the maximum amount of RAM Meilisearch can use when indexing.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-memory
# max_indexing_memory = "12 GiB"

# Sets the maximum number of threads Meilisearch can use during indexing.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-threads
# max_indexing_threads = 10

#############
### DUMPS ###
#############

# Sets the directory where Meilisearch will create dump files.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#dump-directory
dump_dir = "dumps/"

# Imports the dump file located at the specified path. Path must point to a .dump file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#import-dump
# import_dump = "./path/to/my/file.dump"

# Prevents Meilisearch from throwing an error when `import_dump` does not point to a valid dump file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-dump
ignore_missing_dump = false

# Prevents a Meilisearch instance with an existing database from throwing an error when using `import_dump`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-dump-if-db-exists
ignore_dump_if_db_exists = false


#################
### SNAPSHOTS ###
#################

# Enables scheduled snapshots when true, disables them when false (the default).
# If the value is given as an integer, then enables the scheduled snapshot with the passed value as the interval
# between each snapshot, in seconds.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#schedule-snapshot-creation
schedule_snapshot = false

# Sets the directory where Meilisearch will store snapshots.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#snapshot-destination
snapshot_dir = "snapshots/"

# Launches Meilisearch after importing a previously-generated snapshot at the given filepath.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#import-snapshot
# import_snapshot = "./path/to/my/snapshot"

# Prevents a Meilisearch instance from throwing an error when `import_snapshot` does not point to a valid snapshot file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-snapshot
ignore_missing_snapshot = false

# Prevents a Meilisearch instance with an existing database from throwing an error when using `import_snapshot`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-snapshot-if-db-exists
ignore_snapshot_if_db_exists = false


###########
### SSL ###
###########

# Enables client authentication in the specified path.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-authentication-path
# ssl_auth_path = "./path/to/root"

# Sets the server's SSL certificates.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-certificates-path
# ssl_cert_path = "./path/to/certfile"

# Sets the server's SSL key files.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-key-path
# ssl_key_path = "./path/to/private-key"

# Sets the server's OCSP file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-ocsp-path
# ssl_ocsp_path = "./path/to/ocsp-file"

# Makes SSL authentication mandatory.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-require-auth
ssl_require_auth = false

# Activates SSL session resumption.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-resumption
ssl_resumption = false

# Activates SSL tickets.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-tickets
ssl_tickets = false

#############################
### Experimental features ###
#############################

# Experimental metrics feature. For more information, see: <https://github.com/meilisearch/meilisearch/discussions/3518>
# Enables the Prometheus metrics on the `GET /metrics` endpoint.
experimental_enable_metrics = false

# Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
experimental_reduce_indexing_memory_usage = false
68 changes: 68 additions & 0 deletions backend/src/meilisearch/csv_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import json
from ast import literal_eval
import pandas as pd


def _write_chunk(records, filename):
    """Serialize *records* (a list of dicts) to *filename* as indented JSON."""
    with open(filename, "w", encoding="utf-8") as outfile:
        json.dump(records, outfile, indent=4)


def csv_to_json():
    """
    Transform one CSV file into (multiple) JSON files named ``Pubmed<N>.json``.

    Make sure the CSV file comes in the right format and that the allocation of
    attributes (column order: PubMed ID, title, abstract, cited-by list,
    published year) matches the code and your file. For more information on the
    expected format check the Meilisearch documentation.
    Make sure the output files are not bigger than Meilisearch's max payload!
    """
    input_file = input("Please give the input file: ")
    chunk_size = int(input("How many abstracts should be saved in one file? (recommended ~ 100k): "))
    print(f"Loading {input_file} ...")
    # Skip malformed CSV rows instead of aborting the whole conversion.
    df = pd.read_csv(input_file, on_bad_lines="warn")

    total = len(df)
    records = []
    file_index = 0  # used to create different output filenames

    print(f"Processing a total of {total} abstracts...")

    for row in range(total):
        # Flush a full chunk to disk before starting the next one.
        if row != 0 and row % chunk_size == 0:
            _write_chunk(records, f"Pubmed{file_index}.json")
            file_index += 1
            print(f"{file_index} / {total // chunk_size + 1}...")  # progress report
            records = []

        # Retrieve attributes by column position; make sure they are
        # JSON-compatible. "Cited by" is stored as the string form of a
        # Python list, so literal_eval() recovers it to count citations.
        cited_by = df.iloc[row, 3]
        records.append(
            {
                "PubMed ID": str(df.iloc[row, 0]),
                "Title": df.iloc[row, 1],
                "Abstract": df.iloc[row, 2],
                "Cited by": cited_by,
                "Cited number": str(len(literal_eval(cited_by))),
                "Published": str(df.iloc[row, 4]),
            }
        )

    # Save the remaining records into the last file.
    _write_chunk(records, f"Pubmed{file_index}.json")

    print("Done")


# Allow running the converter directly as a script.
if __name__ == "__main__":
    csv_to_json()
94 changes: 94 additions & 0 deletions backend/src/meilisearch/meilisearch_add_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import json
import time
import Api_key
import meilisearch


def add_data(client):
    """
    Upload the given JSON files to Meilisearch.

    Takes any number of JSON files and uploads them to Meilisearch.
    Please make sure you follow the naming conventions. If you want to upload
    multiple files give them the same name with an ascending number after the
    name - starting at 0.
    Example: upload_to_db_0.json, upload_to_db_1.json, upload_to_db_2.json, ...
    The upload speed depends on the size of your files; the program crashes
    without an error if the size of one of your documents is bigger than the
    max payload limit set in Meilisearch. Recommended size < 350MB.

    :param client: a connected ``meilisearch.Client`` authorized to add documents
    """
    number_files = int(input("How many files do you want to upload?: "))

    if number_files > 1:
        print(
            "\nPlease make sure your files are named correctly - "
            "same name, ascending numbers at the end starting with 0:"
            "\ntest0.json, test1.json, test2.json ... \n"
        )
        input_file = input(
            "Please only give the name without .json ending and number"
            "\nlike this: 'test' for test0.json: "
        )
        multiple_files = True
    elif number_files == 1:
        input_file = input("Please give the input file you would like to upload: ")
        multiple_files = False
    else:
        # Zero (or negative) file count: nothing to upload.
        return

    index = input("What index do you want the data to be added to? ")
    print("Processing, this may take a while...")

    for file_number in range(number_files):
        if multiple_files:
            filename = f"{input_file}{file_number}.json"
        else:
            filename = input_file

        # Open the file with our data - it has to be JSON!
        with open(filename, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)

        print("Loading: ", filename)

        task = client.index(index).add_documents(data)
        task_id = task.task_uid

        # Poll until the task leaves the queue; sleep between requests to
        # avoid spamming the server with status checks.
        while task.status in ("enqueued", "processing"):
            time.sleep(10)
            task = client.get_task(task_id)

        if task.status == "failed":
            print("Failed to upload to meilisearch, check meilisearch console")
            return

        if file_number + 1 < number_files:
            print(f"{file_number + 1} / {number_files} - {filename} uploaded successfully")
        else:
            print(f"Done - {filename} uploaded successfully")


def delete_index(client):
    """Ask the user for an index name and delete that index from Meilisearch."""
    target = input("What index do you want to delete? ")
    client.delete_index(target)


def main():
    """Connect to the local Meilisearch instance and upload data interactively."""
    # Set up the client connection to the database, using the admin API key
    # to identify us.
    client = meilisearch.Client("http://localhost:7700", Api_key.ADMIN_API_KEY)

    add_data(client)

    # Uncomment this to delete an index:
    # delete_index(client)


# Run the interactive uploader when executed as a script.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 0985873

Please sign in to comment.