-
Notifications
You must be signed in to change notification settings - Fork 84
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Synchronize between uploading file and generating index thread #4443
base: master
Are you sure you want to change the base?
Changes from 60 commits
ac5ed66
b9fbc35
20aea6d
a1842a4
ce7de68
78e2b0d
14cbc59
fda189a
54507b9
40e0fe8
de13ee5
2db90e7
bde9f10
cdb2240
0a6e9da
d4c6b58
3a9f8d0
8ba4f68
0f409b0
5c9cc8a
d4509f6
99402f7
5f55e54
f29e7fa
5cc9025
cbbb4f9
5ce3bf8
0490cf5
90b13f0
e48d91b
32d39db
7e8f511
28e5e14
4111e81
ef3c907
c17aa9a
ce715e8
8b28b1f
76d2888
61617b3
b0af9ed
7741c94
f193e80
77480f1
1bd3bbd
8bb7c5b
d6bdefd
f0e3215
6d86462
2ab01be
f6ab881
f4589ba
f89109b
c06f37d
75f81af
6b96ebb
9f524ef
7f02173
5d6c496
5c3c7b5
12c3fe0
b2cf26b
3037b17
a7b473d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ Thumbs.db | |
*MANIFEST | ||
*.egg-info | ||
venv*/ | ||
*/__pycache__/* | ||
|
||
/nav | ||
/tags | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,11 +10,16 @@ class MultiReaderFileStream(BytesIO): | |
""" | ||
NUM_READERS = 2 | ||
|
||
# MAX memory usage <= MAX_BUF_SIZE + max(num_bytes called in read) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What does this comment mean? Can you add a description of what MAX_BUF_SIZE is used for? |
||
MAX_BUF_SIZE = 1024 * 1024 * 1024 # 10 MiB for test | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What value should it be for non-test? |
||
|
||
|
||
def __init__(self, fileobj): | ||
self._bufs = [BytesBuffer() for _ in range(0, self.NUM_READERS)] | ||
self._pos = [0 for _ in range(0, self.NUM_READERS)] | ||
self._fileobj = fileobj | ||
self._lock = Lock() # lock to ensure one does not concurrently read self._fileobj / write to the buffers. | ||
self._current_max_buf_length = 0 | ||
|
||
class FileStreamReader(BytesIO): | ||
def __init__(s, index): | ||
|
@@ -36,15 +41,39 @@ def _fill_buf_bytes(self, index: int, num_bytes=None): | |
break | ||
for i in range(0, self.NUM_READERS): | ||
self._bufs[i].write(s) | ||
self.find_largest_buffer() | ||
|
||
def find_largest_buffer(self): | ||
self._current_max_buf_length = len(self._bufs[0]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a docstring comment |
||
for i in range(1, self.NUM_READERS): | ||
self._current_max_buf_length = max(self._current_max_buf_length, len(self._bufs[i])) | ||
# print(f"find largest buffer: {self._current_max_buf_length} in thread: {threading.current_thread().name}") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. remove comments |
||
|
||
|
||
def read(self, index: int, num_bytes=None): # type: ignore | ||
"""Read the specified number of bytes from the associated file. | ||
index: index that specifies which reader is reading. | ||
""" | ||
|
||
# print(f"calling read() in thread {threading.current_thread().name}, num_bytes={num_bytes}") | ||
# busy waiting until | ||
while(self._current_max_buf_length > self.MAX_BUF_SIZE and len(self._bufs[index]) < self._current_max_buf_length): | ||
# only the slowest reader could read | ||
# print(f"Busy waiting in thread: {threading.current_thread().name}, current max_len = {self._current_max_buf_length}, current_buf_size = {len(self._bufs[index])}") | ||
pass | ||
|
||
# If current thread is the slowest reader, continue read. | ||
# If current thread is the slowest reader, and num_bytes > len(self._buf[index]) / num_bytes = None, will continue grow the buffer. | ||
# max memory usage <= MAX_BUF_SIZE + max(num_bytes called in read) | ||
self._fill_buf_bytes(index, num_bytes) | ||
assert self._current_max_buf_length <= 2 * self.MAX_BUF_SIZE | ||
if num_bytes is None: | ||
num_bytes = len(self._bufs[index]) | ||
s = self._bufs[index].read(num_bytes) | ||
self.find_largest_buffer() | ||
# print("Current thread name: ", threading.current_thread().name) | ||
|
||
|
||
self._pos[index] += len(s) | ||
return s | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,7 @@ def test_not_found(self): | |
|
||
def check_file_target_contents(self, target): | ||
"""Checks to make sure that the specified file has the contents 'hello world'.""" | ||
# This can not be checked, Since | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Update comment? |
||
with self.download_manager.stream_file(target, gzipped=False) as f: | ||
self.assertEqual(f.read(), b"hello world") | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we refactor this into a constant? Also, is 10 hours enough or might we need more for even larger files?