Skip to content

Commit

Permalink
Improve upload speed when posting files to batch api (#75)
Browse files Browse the repository at this point in the history
Use a file-like object instead of the file contents when posting a batch job
  • Loading branch information
anjz authored Oct 19, 2023
1 parent fc57b65 commit a5ad78e
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 19 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- Improve upload speeds for files submitted with the batch client

## [1.11.0] - 2023-08-25

### Added
Expand Down
49 changes: 30 additions & 19 deletions speechmatics/batch_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,28 +223,39 @@ def submit_job(
)

# If audio=None, fetch_data must be specified
if audio and "fetch_data" in config_dict:
raise ValueError("Only one of audio or fetch_data can be set at a time")
if not audio and "fetch_data" in config_dict:
audio_data = None
elif isinstance(audio, (str, os.PathLike)):
with Path(audio).expanduser().open("rb") as file:
audio_data = os.path.basename(file.name), file.read()
elif isinstance(audio, tuple) and "fetch_data" not in config_dict:
audio_data = audio
else:
raise ValueError("Audio must be a filepath or a tuple of (filename, bytes)")
file_object = None
try:
if audio and "fetch_data" in config_dict:
raise ValueError("Only one of audio or fetch_data can be set at a time")
if not audio and "fetch_data" in config_dict:
audio_data = None
elif isinstance(audio, (str, os.PathLike)):
# httpx performance is better when using a file-like object
# compared to passing the file contents as bytes.
file_object = Path(audio).expanduser().open("rb")
audio_data = os.path.basename(file_object.name), file_object
elif isinstance(audio, tuple) and "fetch_data" not in config_dict:
audio_data = audio
else:
raise ValueError(
"Audio must be a filepath or a tuple of (filename, bytes)"
)

# httpx seems to expect an un-nested json, throws a type error otherwise.
config_data = {"config": json.dumps(config_dict, ensure_ascii=False)}
# httpx seems to expect an un-nested json, throws a type error otherwise.
config_data = {"config": json.dumps(config_dict, ensure_ascii=False)}

if audio_data:
audio_file = {"data_file": audio_data}
else:
audio_file = _ForceMultipartDict()
if audio_data:
audio_file = {"data_file": audio_data}
else:
audio_file = _ForceMultipartDict()

response = self.send_request("POST", "jobs", data=config_data, files=audio_file)
return response.json()["id"]
response = self.send_request(
"POST", "jobs", data=config_data, files=audio_file
)
return response.json()["id"]
finally:
if file_object:
file_object.close()

def submit_jobs(
self,
Expand Down

0 comments on commit a5ad78e

Please sign in to comment.