diff --git a/backend/.gitignore b/backend/.gitignore index ff3e8f8..0e3a7d9 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -1,6 +1,11 @@ venv -sources -uploads + +sources/ +!sources/sample.hackthehill + +uploads/ +!uploads/sample.txt + __pycache__ .coverage .idea \ No newline at end of file diff --git a/backend/code/config.py b/backend/code/config.py deleted file mode 100644 index 03e5b5c..0000000 --- a/backend/code/config.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -TODO -""" - -import os.path - -DISCOVERY_PORT = 5000 -CHAT_PORT = 5001 -MAX_UDP_PACKET = 65507 -DISCOVERY_ADDRESS = '192.168.181.255' -UPLOAD_FOLDER = os.path.abspath("../uploads") diff --git a/backend/code/file_tokenizer.py b/backend/code/file_tokenizer.py index 3901ee2..ad0b774 100755 --- a/backend/code/file_tokenizer.py +++ b/backend/code/file_tokenizer.py @@ -8,6 +8,8 @@ from pathlib import Path +from config import SOURCES_FOLDER + class SenderTokenizer: """ @@ -31,6 +33,7 @@ def hash_file_blocks(self, block_size=512): be used to hide the nature of the file in communication """ + hackthehill_file = Path(self.file_path).stem + ".hackthehill" file_size: int = os.path.getsize(self.file_path) num_blocks = (file_size + block_size - 1) // block_size # Round up division @@ -58,8 +61,7 @@ def hash_file_blocks(self, block_size=512): hash_block = json.dumps(header, indent=2) - with open("./sources/" + Path(self.file_path).stem + ".hackthehill", - 'w', encoding="utf-8") as f: + with open(os.path.join(SOURCES_FOLDER, hackthehill_file), 'w', encoding="utf-8") as f: f.write(hash_block) return hashlib.sha256(json.dumps(header).encode('utf-8')).hexdigest() diff --git a/backend/code/main.py b/backend/code/main.py index 9768f87..31adcea 100644 --- a/backend/code/main.py +++ b/backend/code/main.py @@ -6,7 +6,7 @@ import io import json -from code.config import UPLOAD_FOLDER +from config import UPLOADS_FOLDER from code.utils import get_filename_by_file_id, custom_hash from code.p2p_client import P2PClient from 
code.file_tokenizer import SenderTokenizer @@ -20,8 +20,8 @@ CORS(app) # Ensure the upload directory exists -if not os.path.exists(UPLOAD_FOLDER): - os.makedirs(UPLOAD_FOLDER) +if not os.path.exists(UPLOADS_FOLDER): + os.makedirs(UPLOADS_FOLDER) fileData = {} diff --git a/backend/code/p2p_client.py b/backend/code/p2p_client.py index 60dd24d..ec56f4b 100644 --- a/backend/code/p2p_client.py +++ b/backend/code/p2p_client.py @@ -13,7 +13,7 @@ from code.utils import get_filename_by_file_id from code.file_tokenizer import ReceiverTokenizer -from code.config import DISCOVERY_PORT, CHAT_PORT, MAX_UDP_PACKET, DISCOVERY_ADDRESS +from config import DISCOVERY_PORT, CHAT_PORT, MAX_UDP_PACKET, DISCOVERY_ADDRESS class P2PClient: diff --git a/backend/code/utils.py b/backend/code/utils.py index 6528a45..1a87ee7 100644 --- a/backend/code/utils.py +++ b/backend/code/utils.py @@ -10,10 +10,12 @@ from pathlib import Path from typing import Optional +from config import SOURCES_FOLDER, UPLOADS_FOLDER + def custom_hash(encode_input): """ - Using the hashlib sha256 encoding, we specifically hash the utf-8 encoding for + Using the hashlib sha256 encoding, we specifically hash the utf-8 encoding for text files. This is our custom hash function we use in the entire project, we should not use the inbuilt hash() function """ @@ -24,11 +26,11 @@ def custom_hash(encode_input): def find_file(directory: str, filename: str) -> Optional[str]: """ Helper function for get_filename_by_file_id - + Given a directory and a filename, check if the directory exists, and if the file exists inside the given directory. 
If yes, return the file name, otherwise return None - + :param directory: str :param filename: str :returns: Either string or None @@ -37,23 +39,28 @@ def find_file(directory: str, filename: str) -> Optional[str]: if os.path.exists(directory): for file in os.listdir(directory): name, _ = os.path.splitext(file) - if name == filename: + if name == Path(filename).stem: return file return None -def get_filename_by_file_id(file_id): +def get_filename_by_file_id(file_id: str) -> Optional[tuple[str, str]]: """ - TODO + Given a file id, find the .hackthehill file whose content hashes to that id and + the original file with the same stem. Returns None if no such pair is found. + Note that by our convention, .hackthehill files exist in backend/sources and the + original files exist in backend/uploads. + + :param file_id: str + :returns: tuple[str, str] | None """ - for fingerprint_file_name in os.listdir("sources"): - with open(os.path.join("sources", fingerprint_file_name), "r", encoding="utf-8") as f: - file_fingerprint_content = f.read() + for hackthehill_file in os.listdir(SOURCES_FOLDER): + with open(os.path.join(SOURCES_FOLDER, hackthehill_file), "r", encoding="utf-8") as f: + hackthehill_file_content = f.read() - if file_id == custom_hash(file_fingerprint_content): - file_name = find_file( - "uploads", Path(fingerprint_file_name).stem) - if file_name is not None: - return [file_name, fingerprint_file_name] + if file_id == custom_hash(hackthehill_file_content): + original_file_name = find_file(UPLOADS_FOLDER, hackthehill_file) + if original_file_name is not None: + return original_file_name, hackthehill_file return None diff --git a/backend/config.py b/backend/config.py new file mode 100644 index 0000000..27aa102 --- /dev/null +++ b/backend/config.py @@ -0,0 +1,19 @@ +""" +Global variables for the project + +Since the folder paths below are derived from this file's own location, it must stay at the root of backend +""" + +import os.path + +DISCOVERY_PORT = 5000 +CHAT_PORT = 5001
+MAX_UDP_PACKET = 65507 +DISCOVERY_ADDRESS = '192.168.181.255' + +BACKEND_FOLDER = os.path.dirname(os.path.abspath(__file__)) + +UPLOADS_FOLDER = os.path.join(BACKEND_FOLDER, "uploads") +SOURCES_FOLDER = os.path.join(BACKEND_FOLDER, "sources") +CODE_FOLDER = os.path.join(BACKEND_FOLDER, "code") +TEST_FOLDER = os.path.join(BACKEND_FOLDER, "test") diff --git a/backend/sources/sample.hackthehill b/backend/sources/sample.hackthehill new file mode 100644 index 0000000..ee03ba3 --- /dev/null +++ b/backend/sources/sample.hackthehill @@ -0,0 +1 @@ +Adding the file just so that sources folder is recognized via github diff --git a/backend/sourcestest.hackthehill b/backend/sourcestest.hackthehill new file mode 100644 index 0000000..be3a143 --- /dev/null +++ b/backend/sourcestest.hackthehill @@ -0,0 +1,9 @@ +{ + "header": { + "file_name": "test.txt", + "file_size": 0, + "number_of_blocks": 0, + "block_size": 512 + }, + "blocks": {} +} \ No newline at end of file diff --git a/backend/test/test_utils.py b/backend/test/test_utils.py index 0199924..a7c4d51 100644 --- a/backend/test/test_utils.py +++ b/backend/test/test_utils.py @@ -2,22 +2,27 @@ Testing the Utilities class """ import hashlib +import os.path +import pathlib import unittest -from code.utils import find_file, custom_hash + +from code.file_tokenizer import SenderTokenizer +from config import SOURCES_FOLDER, CODE_FOLDER, UPLOADS_FOLDER +from code.utils import find_file, custom_hash, get_filename_by_file_id class TestUtils(unittest.TestCase): """ Utilities class functions are generic and widely used. Their implementation is very important. 
""" - + def test_custom_hash(self): """ We have our own implementation for hashing utf-8 format encoding """ - + encoded_input = "Hi Mom" - + self.assertEqual(custom_hash(encoded_input), hashlib.sha256(encoded_input.encode("utf-8")).hexdigest()) def test_find_file_with_garbage_file_name_returns_none(self): @@ -33,13 +38,54 @@ def test_find_file_with_garbage_directory_name_returns_none(self): """ self.assertEqual(find_file("random_directory", "uploads.txt"), None) - + def test_find_file_with_proper_director_and_file_returns_filename(self): """ If a file and directory exists, and file exists inside the directory, return the filename """ - - self.assertEqual(find_file("code", "utils"), "utils.py") - - \ No newline at end of file + + self.assertEqual(find_file(CODE_FOLDER, "utils.py"), "utils.py") + + def test_get_filename_by_file_id_with_no_matching_id_returns_none(self): + """ + If none of the hashed ids match with the file id, we should return None + """ + + testing_file = os.path.join(UPLOADS_FOLDER, "test_get_filename_by_file_id_with_no_matching_id_returns_none.txt") + message = "test_get_filename_by_file_id_with_no_matching_id_returns_none" + + with open(testing_file, "x", encoding="utf-8") as f: + f.write(message) + file_id = "random-file-id" + + self.assertEqual(get_filename_by_file_id(file_id), None) + + os.remove(testing_file) + + def test_get_filename_by_file_id_with_matching_id_returns_tuple(self): + """ + If the hashed id is the same as the hash of .hackthehill file, we should return the tuple of filenames + """ + + testing_file = os.path.join(UPLOADS_FOLDER, + "test_get_filename_by_file_id_with_matching_id_returns_tuple.txt") + hackthehill_file = os.path.join(SOURCES_FOLDER, + "test_get_filename_by_file_id_with_matching_id_returns_tuple.hackthehill") + + hashed_file = SenderTokenizer(testing_file) + message = "test_get_filename_by_file_id_with_matching_id_returns_tuple" + + with open(testing_file, "x", encoding="utf-8") as f: + f.write(message) + 
hashed_file.hash_file_blocks() + + with open(hackthehill_file, "r", encoding="utf-8") as f: + hackthehill_file_content = f.read() + file_id = custom_hash(hackthehill_file_content) + + self.assertEqual(get_filename_by_file_id(file_id), (os.path.basename(testing_file), + os.path.basename(hackthehill_file))) + + os.remove(testing_file) + os.remove(hackthehill_file) diff --git a/backend/uploads/sample.txt b/backend/uploads/sample.txt new file mode 100644 index 0000000..02fb357 --- /dev/null +++ b/backend/uploads/sample.txt @@ -0,0 +1 @@ +Adding the file just so that uploads folder is recognized via github diff --git a/setup.sh b/setup.sh index a1e0ceb..9d4c412 100755 --- a/setup.sh +++ b/setup.sh @@ -12,5 +12,7 @@ cd "$backendDir" python3.9 -m venv venv source venv/bin/activate pip install -r requirements.txt +mkdir sources +mkdir uploads cd "$currentDir"