Skip to content

Commit

Permalink
Merge pull request #60 from BackofenLab/workJannis
Browse files Browse the repository at this point in the history
Merge meilisearch to master
  • Loading branch information
JannisHaeffner authored Feb 22, 2024
2 parents a09a197 + e5be8c0 commit 0985873
Show file tree
Hide file tree
Showing 7 changed files with 887 additions and 0 deletions.
131 changes: 131 additions & 0 deletions backend/src/meilisearch/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# This file shows the default configuration of Meilisearch.
# All variables are defined here: https://www.meilisearch.com/docs/learn/configuration/instance_options#environment-variables

# Designates the location where database files will be created and retrieved.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#database-path
db_path = "./data.ms"

# Configures the instance's environment. Value must be either `production` or `development`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#environment
env = "development"

# The address on which the HTTP server will listen.
http_addr = "localhost:7700"

# Sets the instance's master key, automatically protecting all routes except GET /health.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#master-key
master_key = "aSampleMasterKey2"

# Deactivates Meilisearch's built-in telemetry when provided.
# Meilisearch automatically collects data from all instances that do not opt out using this flag.
# All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted at any time.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#disable-analytics
# no_analytics = true

# Sets the maximum size of accepted payloads.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#payload-limit-size
http_payload_size_limit = "550 MB"

# Defines how much detail should be present in Meilisearch's logs.
# Meilisearch currently supports six log levels, listed in order of increasing verbosity: `OFF`, `ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE`
# https://www.meilisearch.com/docs/learn/configuration/instance_options#log-level
log_level = "INFO"

# Sets the maximum amount of RAM Meilisearch can use when indexing.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-memory
# max_indexing_memory = "12 GiB"

# Sets the maximum number of threads Meilisearch can use during indexing.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-threads
# max_indexing_threads = 10

#############
### DUMPS ###
#############

# Sets the directory where Meilisearch will create dump files.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#dump-directory
dump_dir = "dumps/"

# Imports the dump file located at the specified path. Path must point to a .dump file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#import-dump
# import_dump = "./path/to/my/file.dump"

# Prevents Meilisearch from throwing an error when `import_dump` does not point to a valid dump file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-dump
ignore_missing_dump = false

# Prevents a Meilisearch instance with an existing database from throwing an error when using `import_dump`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-dump-if-db-exists
ignore_dump_if_db_exists = false


#################
### SNAPSHOTS ###
#################

# Enables scheduled snapshots when true, disables them when false (the default).
# If the value is given as an integer, then enables the scheduled snapshot with the passed value as the interval
# between each snapshot, in seconds.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#schedule-snapshot-creation
schedule_snapshot = false

# Sets the directory where Meilisearch will store snapshots.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#snapshot-destination
snapshot_dir = "snapshots/"

# Launches Meilisearch after importing a previously-generated snapshot at the given filepath.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#import-snapshot
# import_snapshot = "./path/to/my/snapshot"

# Prevents a Meilisearch instance from throwing an error when `import_snapshot` does not point to a valid snapshot file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-snapshot
ignore_missing_snapshot = false

# Prevents a Meilisearch instance with an existing database from throwing an error when using `import_snapshot`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-snapshot-if-db-exists
ignore_snapshot_if_db_exists = false


###########
### SSL ###
###########

# Enables client authentication in the specified path.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-authentication-path
# ssl_auth_path = "./path/to/root"

# Sets the server's SSL certificates.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-certificates-path
# ssl_cert_path = "./path/to/certfile"

# Sets the server's SSL key files.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-key-path
# ssl_key_path = "./path/to/private-key"

# Sets the server's OCSP file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-ocsp-path
# ssl_ocsp_path = "./path/to/ocsp-file"

# Makes SSL authentication mandatory.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-require-auth
ssl_require_auth = false

# Activates SSL session resumption.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-resumption
ssl_resumption = false

# Activates SSL tickets.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-tickets
ssl_tickets = false

#############################
### Experimental features ###
#############################

# Experimental metrics feature. For more information, see: <https://github.com/meilisearch/meilisearch/discussions/3518>
# Enables the Prometheus metrics on the `GET /metrics` endpoint.
experimental_enable_metrics = false

# Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
experimental_reduce_indexing_memory_usage = false
68 changes: 68 additions & 0 deletions backend/src/meilisearch/csv_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import json
from ast import literal_eval
import pandas as pd


def _write_chunk(records, filename):
    """Serialize *records* (a list of dicts) to *filename* as indented JSON."""
    with open(filename, "w", encoding="utf-8") as outfile:
        json.dump(records, outfile, indent=4)


def csv_to_json():
    """
    Transform one CSV file into (multiple) JSON files named ``Pubmed<N>.json``.

    Make sure the CSV file comes in the right format and that the allocation of
    attributes (column order: PubMed ID, title, abstract, cited-by list,
    published year) matches the code and your file. For more information on the
    expected format check the Meilisearch documentation.
    Make sure the output files are not bigger than Meilisearch's max payload!
    """
    input_file = input("Please give the input file: ")
    chunk_size = int(input("How many abstracts should be saved in one file? (recommended ~ 100k): "))
    print(f"Loading {input_file} ...")
    # Skip malformed CSV rows instead of aborting the whole conversion.
    df = pd.read_csv(input_file, on_bad_lines="warn")

    total = len(df)
    records = []
    file_index = 0  # used to create different output filenames

    print(f"Processing a total of {total} abstracts...")

    for row in range(total):
        # Flush a full chunk to disk before starting the next one.
        if row != 0 and row % chunk_size == 0:
            _write_chunk(records, f"Pubmed{file_index}.json")
            file_index += 1
            print(f"{file_index} / {total // chunk_size + 1}...")  # progress report
            records = []

        # Retrieve attributes by column position; make sure they are
        # JSON-compatible. "Cited by" is stored as the string form of a
        # Python list, so literal_eval() recovers it to count citations.
        cited_by = df.iloc[row, 3]
        records.append(
            {
                "PubMed ID": str(df.iloc[row, 0]),
                "Title": df.iloc[row, 1],
                "Abstract": df.iloc[row, 2],
                "Cited by": cited_by,
                "Cited number": str(len(literal_eval(cited_by))),
                "Published": str(df.iloc[row, 4]),
            }
        )

    # Save the remaining records into the last file.
    _write_chunk(records, f"Pubmed{file_index}.json")

    print("Done")


# Allow running the converter directly as a script.
if __name__ == "__main__":
    csv_to_json()
94 changes: 94 additions & 0 deletions backend/src/meilisearch/meilisearch_add_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import json
import time
import Api_key
import meilisearch


def add_data(client):
    """
    Upload the given JSON files to Meilisearch.

    Takes any number of JSON files and uploads them to Meilisearch.
    Please make sure you follow the naming conventions. If you want to upload
    multiple files give them the same name with an ascending number after the
    name - starting at 0.
    Example: upload_to_db_0.json, upload_to_db_1.json, upload_to_db_2.json, ...
    The upload speed depends on the size of your files; the program crashes
    without an error if the size of one of your documents is bigger than the
    max payload limit set in Meilisearch. Recommended size < 350MB.

    :param client: a connected ``meilisearch.Client`` authorized to add documents
    """
    number_files = int(input("How many files do you want to upload?: "))

    if number_files > 1:
        print(
            "\nPlease make sure your files are named correctly - "
            "same name, ascending numbers at the end starting with 0:"
            "\ntest0.json, test1.json, test2.json ... \n"
        )
        input_file = input(
            "Please only give the name without .json ending and number"
            "\nlike this: 'test' for test0.json: "
        )
        multiple_files = True
    elif number_files == 1:
        input_file = input("Please give the input file you would like to upload: ")
        multiple_files = False
    else:
        # Zero (or negative) file count: nothing to upload.
        return

    index = input("What index do you want the data to be added to? ")
    print("Processing, this may take a while...")

    for file_number in range(number_files):
        if multiple_files:
            filename = f"{input_file}{file_number}.json"
        else:
            filename = input_file

        # Open the file with our data - it has to be JSON!
        with open(filename, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)

        print("Loading: ", filename)

        task = client.index(index).add_documents(data)
        task_id = task.task_uid

        # Poll until the task leaves the queue; sleep between requests to
        # avoid spamming the server with status checks.
        while task.status in ("enqueued", "processing"):
            time.sleep(10)
            task = client.get_task(task_id)

        if task.status == "failed":
            print("Failed to upload to meilisearch, check meilisearch console")
            return

        if file_number + 1 < number_files:
            print(f"{file_number + 1} / {number_files} - {filename} uploaded successfully")
        else:
            print(f"Done - {filename} uploaded successfully")


def delete_index(client):
    """Ask the user for an index name and delete that index from Meilisearch."""
    target = input("What index do you want to delete? ")
    client.delete_index(target)


def main():
    """Connect to the local Meilisearch instance and upload data interactively."""
    # Set up the client connection to the database, using the admin API key
    # to identify us.
    client = meilisearch.Client("http://localhost:7700", Api_key.ADMIN_API_KEY)

    add_data(client)

    # Uncomment this to delete an index:
    # delete_index(client)


# Run the interactive uploader when executed as a script.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 0985873

Please sign in to comment.