From 457ecdf7a1e198a370e69bda36c56312a5ee9283 Mon Sep 17 00:00:00 2001 From: Samuel Park Date: Wed, 12 Jul 2023 16:31:43 -0400 Subject: [PATCH] RTX 4090 laptop results --- .../PyTorch_SSD_AMP/12-07-2023_20-08-06.txt | 51 ++ .../PyTorch_SSD_AMP/benchmark.para | 2 + .../PyTorch_SSD_FP32/12-07-2023_20-15-34.txt | 49 ++ .../PyTorch_SSD_FP32/benchmark.para | 2 + .../12-07-2023_20-20-52.txt | 48 ++ .../benchmark.para | 2 + .../12-07-2023_20-09-35.txt | 33 + .../benchmark.para | 2 + .../12-07-2023_20-07-03.txt | 48 ++ .../benchmark.para | 2 + .../12-07-2023_19-53-59.txt | 34 + .../benchmark.para | 2 + .../PyTorch_gnmt_FP16/12-07-2023_20-18-34.txt | 233 ++++++ .../PyTorch_gnmt_FP16/benchmark.para | 2 + .../PyTorch_gnmt_FP32/12-07-2023_20-02-53.txt | 267 +++++++ .../PyTorch_gnmt_FP32/benchmark.para | 2 + .../PyTorch_ncf_FP16/12-07-2023_19-57-50.txt | 39 + .../PyTorch_ncf_FP16/benchmark.para | 2 + .../PyTorch_ncf_FP32/12-07-2023_19-50-49.txt | 20 + .../PyTorch_ncf_FP32/benchmark.para | 2 + .../12-07-2023_19-52-22.txt | 217 ++++++ .../PyTorch_resnet50_AMP/benchmark.para | 2 + .../12-07-2023_20-15-45.txt | 217 ++++++ .../PyTorch_resnet50_FP32/benchmark.para | 2 + .../12-07-2023_19-51-19.txt | 173 +++++ .../PyTorch_tacotron2_FP16/benchmark.para | 2 + .../12-07-2023_20-12-14.txt | 170 +++++ .../PyTorch_tacotron2_FP32/benchmark.para | 2 + .../12-07-2023_20-08-21.txt | 202 +++++ .../benchmark.para | 2 + .../12-07-2023_20-17-16.txt | 183 +++++ .../benchmark.para | 2 + .../12-07-2023_20-10-32.txt | 202 +++++ .../benchmark.para | 2 + .../12-07-2023_20-00-32.txt | 185 +++++ .../benchmark.para | 2 + .../12-07-2023_19-58-10.txt | 709 +++++++++++++++++ .../PyTorch_waveglow_FP16/benchmark.para | 2 + .../12-07-2023_20-13-13.txt | 710 ++++++++++++++++++ .../PyTorch_waveglow_FP32/benchmark.para | 2 + pytorch/results/4090laptop_v1/summary.txt | 22 + pytorch/results/4090laptop_v1/sys_pytorch.txt | 10 + .../config_v1/config_pytorch_4090laptop_v1.sh | 7 + 43 files changed, 3869 insertions(+) create mode 100755 pytorch/results/4090laptop_v1/PyTorch_SSD_AMP/12-07-2023_20-08-06.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_SSD_AMP/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_SSD_FP32/12-07-2023_20-15-34.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_SSD_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_bert_base_squad_FP16/12-07-2023_20-20-52.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_bert_base_squad_FP16/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_bert_base_squad_FP32/12-07-2023_20-09-35.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_bert_base_squad_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_bert_large_squad_FP16/12-07-2023_20-07-03.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_bert_large_squad_FP16/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_bert_large_squad_FP32/12-07-2023_19-53-59.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_bert_large_squad_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_gnmt_FP16/12-07-2023_20-18-34.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_gnmt_FP16/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_gnmt_FP32/12-07-2023_20-02-53.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_gnmt_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_ncf_FP16/12-07-2023_19-57-50.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_ncf_FP16/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_ncf_FP32/12-07-2023_19-50-49.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_ncf_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_resnet50_AMP/12-07-2023_19-52-22.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_resnet50_AMP/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_resnet50_FP32/12-07-2023_20-15-45.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_resnet50_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_tacotron2_FP16/12-07-2023_19-51-19.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_tacotron2_FP16/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_tacotron2_FP32/12-07-2023_20-12-14.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_tacotron2_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_transformerxlbase_FP16/12-07-2023_20-08-21.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_transformerxlbase_FP16/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_transformerxlbase_FP32/12-07-2023_20-17-16.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_transformerxlbase_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_transformerxllarge_FP16/12-07-2023_20-10-32.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_transformerxllarge_FP16/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_transformerxllarge_FP32/12-07-2023_20-00-32.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_transformerxllarge_FP32/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_waveglow_FP16/12-07-2023_19-58-10.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_waveglow_FP16/benchmark.para create mode 100755 pytorch/results/4090laptop_v1/PyTorch_waveglow_FP32/12-07-2023_20-13-13.txt create mode 100755 pytorch/results/4090laptop_v1/PyTorch_waveglow_FP32/benchmark.para create mode 100644 pytorch/results/4090laptop_v1/summary.txt create mode 100755 pytorch/results/4090laptop_v1/sys_pytorch.txt create mode 100644 pytorch/scripts/config_v1/config_pytorch_4090laptop_v1.sh diff --git a/pytorch/results/4090laptop_v1/PyTorch_SSD_AMP/12-07-2023_20-08-06.txt b/pytorch/results/4090laptop_v1/PyTorch_SSD_AMP/12-07-2023_20-08-06.txt new file mode 100755 index 00000000..9ac990be --- /dev/null +++ b/pytorch/results/4090laptop_v1/PyTorch_SSD_AMP/12-07-2023_20-08-06.txt @@ -0,0 +1,51 @@ +/opt/conda/lib/python3.8/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and will be removed in 0.15, please use 'weights' instead. + warnings.warn( +/opt/conda/lib/python3.8/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet50_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet50_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth + 0%| | 0.00/97.8M [00:00 + train(train_loop_func, logger, args) + File "main.py", line 146, in train + cocoGt = get_coco_ground_truth(args) + File "/workspace/benchmark/Detection/SSD/ssd/data.py", line 73, in get_coco_ground_truth + cocoGt = COCO(annotation_file=val_annotate, use_ext=True) + File "/opt/conda/lib/python3.8/site-packages/pycocotools/coco.py", line 89, in __init__ + dataset = json.load(open(annotation_file, 'r')) +FileNotFoundError: [Errno 2] No such file or directory: '/data/object_detection/annotations/instances_val2017.json' +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3160) of binary: /opt/conda/bin/python +Traceback (most recent call last): + File "/opt/conda/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')()) + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main + run(args) + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run + elastic_launch( + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +main.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2023-07-12_20:08:16 + host : 24a2da7181c1 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 3160) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +DONE! diff --git a/pytorch/results/4090laptop_v1/PyTorch_SSD_AMP/benchmark.para b/pytorch/results/4090laptop_v1/PyTorch_SSD_AMP/benchmark.para new file mode 100755 index 00000000..70c9b31e --- /dev/null +++ b/pytorch/results/4090laptop_v1/PyTorch_SSD_AMP/benchmark.para @@ -0,0 +1,2 @@ +GLOBAL_BATCH 96 +GPU 1 diff --git a/pytorch/results/4090laptop_v1/PyTorch_SSD_FP32/12-07-2023_20-15-34.txt b/pytorch/results/4090laptop_v1/PyTorch_SSD_FP32/12-07-2023_20-15-34.txt new file mode 100755 index 00000000..07b0b326 --- /dev/null +++ b/pytorch/results/4090laptop_v1/PyTorch_SSD_FP32/12-07-2023_20-15-34.txt @@ -0,0 +1,49 @@ +/opt/conda/lib/python3.8/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and will be removed in 0.15, please use 'weights' instead. + warnings.warn( +/opt/conda/lib/python3.8/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet50_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet50_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +NOTE! Installing ujson may make loading annotations faster. +DLL 2023-07-12 20:15:36.177470 - PARAMETER dataset path : /data/object_detection epochs : 1 batch size : 48 eval batch size : 32 no cuda : False seed : None checkpoint path : None mode : benchmark-training eval on epochs : [21, 31, 37, 42, 48, 53, 59, 64] lr decay epochs : [43, 54] learning rate : 0.0 momentum : 0.9 weight decay : 0.0005 lr warmup : None backbone : resnet50 backbone path : None num workers : 4 AMP : False precision : fp32 +Using seed = 4198 +loading annotations into memory... +Traceback (most recent call last): + File "main.py", line 286, in + train(train_loop_func, logger, args) + File "main.py", line 146, in train + cocoGt = get_coco_ground_truth(args) + File "/workspace/benchmark/Detection/SSD/ssd/data.py", line 73, in get_coco_ground_truth + cocoGt = COCO(annotation_file=val_annotate, use_ext=True) + File "/opt/conda/lib/python3.8/site-packages/pycocotools/coco.py", line 89, in __init__ + dataset = json.load(open(annotation_file, 'r')) +FileNotFoundError: [Errno 2] No such file or directory: '/data/object_detection/annotations/instances_val2017.json' +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 5329) of binary: /opt/conda/bin/python +Traceback (most recent call last): + File "/opt/conda/bin/torchrun", line 33, in + sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')()) + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main + run(args) + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run + elastic_launch( + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +main.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2023-07-12_20:15:40 + host : 24a2da7181c1 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 5329) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +DONE! diff --git a/pytorch/results/4090laptop_v1/PyTorch_SSD_FP32/benchmark.para b/pytorch/results/4090laptop_v1/PyTorch_SSD_FP32/benchmark.para new file mode 100755 index 00000000..e35db812 --- /dev/null +++ b/pytorch/results/4090laptop_v1/PyTorch_SSD_FP32/benchmark.para @@ -0,0 +1,2 @@ +GLOBAL_BATCH 48 +GPU 1 diff --git a/pytorch/results/4090laptop_v1/PyTorch_bert_base_squad_FP16/12-07-2023_20-20-52.txt b/pytorch/results/4090laptop_v1/PyTorch_bert_base_squad_FP16/12-07-2023_20-20-52.txt new file mode 100755 index 00000000..f0a883c3 --- /dev/null +++ b/pytorch/results/4090laptop_v1/PyTorch_bert_base_squad_FP16/12-07-2023_20-20-52.txt @@ -0,0 +1,48 @@ +Container nvidia build = 46164382 +out dir is . +fp16 activated! +torchrun --nproc_per_node=1 run_squad.py --init_checkpoint=/data/bert_base/bert_base_uncased.pt --do_train --train_file=/data/squad/v1.1/train-v1.1.json --train_batch_size=48 --do_lower_case --bert_model=bert-large-uncased --learning_rate=0.0 --warmup_proportion=0.1 --seed=1 --num_train_epochs=2.0 --max_seq_length=384 --doc_stride=128 --output_dir=. --vocab_file=/data/bert_base/bert-base-uncased-vocab.txt --config_file=/data/bert_base/bert_config.json --max_steps=100 --fp16 |& tee ./logfile.txt +07/12/2023 20:20:55 - INFO - torch.distributed.distributed_c10d - Added key: store_based_barrier_key:1 to store for rank: 0 +07/12/2023 20:20:55 - INFO - torch.distributed.distributed_c10d - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. +device: cuda:0 n_gpu: 1, distributed training: True, 16-bits training: True +DLL 2023-07-12 20:20:55.811283 - PARAMETER Config : ["Namespace(amp=False, bert_model='bert-large-uncased', cache_dir=None, config_file='/data/bert_base/bert_config.json', disable_progress_bar=False, do_eval=False, do_lower_case=True, do_predict=False, do_train=True, doc_stride=128, eval_script='evaluate.py', fp16=True, gradient_accumulation_steps=1, init_checkpoint='/data/bert_base/bert_base_uncased.pt', json_summary='results/dllogger.json', learning_rate=0.0, local_rank=0, log_freq=50, loss_scale=0, max_answer_length=30, max_query_length=64, max_seq_length=384, max_steps=100.0, n_best_size=20, no_cuda=False, null_score_diff_threshold=0.0, num_train_epochs=2.0, output_dir='.', predict_batch_size=8, predict_file=None, seed=1, skip_cache=False, skip_checkpoint=False, train_batch_size=48, train_file='/data/squad/v1.1/train-v1.1.json', use_env=False, verbose_logging=False, version_2_with_negative=False, vocab_file='/data/bert_base/bert-base-uncased-vocab.txt', warmup_proportion=0.1)"] +DLL 2023-07-12 20:20:55.811401 - PARAMETER SEED : 1 +WARNING: Output directory . already exists and is not empty. ['tokenization_utils.py', 'requirements.txt', 'checkpoints', 'bert_configs', 'lamb_amp_opt', 'configurations.yml', 'schedulers.py', 'README.md', 'distillation', 'create_pretraining_data.py', '.gitmodules', 'data', 'vocab', 'tokenization.py', 'Dockerfile', 'extract_features.py', 'run.sub', 'images', 'processors', 'run_swag.py', 'run_pretraining.py', 'inference.py', 'config.json', 'scripts', 'modeling.py', 'LICENSE', 'logfile.txt', 'optimization.py', 'pytorch_model.bin', '.dockerignore', 'run_glue.py', 'run_squad.py', 'triton', 'NOTICE', '__pycache__', 'results', '.gitignore', 'utils.py', 'bind.sh', 'bind_pyt.py', 'file_utils.py'] +DLL 2023-07-12 20:21:00.032482 - PARAMETER loading_checkpoint : True +DLL 2023-07-12 20:21:00.032556 - PARAMETER loaded_checkpoint : True +DLL 2023-07-12 20:21:00.154267 - PARAMETER model_weights_num : 109488386 +Selected optimization level O2: FP16 training with FP32 batchnorm and FP32 master weights. + +Defaults for this optimization level are: +enabled : True +opt_level : O2 +cast_model_type : torch.float16 +patch_torch_functions : False +keep_batchnorm_fp32 : True +master_weights : True +loss_scale : dynamic +Processing user overrides (additional kwargs that are not None)... +After processing overrides, optimization options are: +enabled : True +opt_level : O2 +cast_model_type : torch.float16 +patch_torch_functions : False +keep_batchnorm_fp32 : False +master_weights : True +loss_scale : dynamic +DLL 2023-07-12 20:21:04.025876 - PARAMETER train_start : True +DLL 2023-07-12 20:21:04.025955 - PARAMETER training_samples : 87599 +DLL 2023-07-12 20:21:04.025971 - PARAMETER training_features : 88641 +DLL 2023-07-12 20:21:04.025981 - PARAMETER train_batch_size : 48 +DLL 2023-07-12 20:21:04.025991 - PARAMETER steps : 3648.0 + Iteration: 0%| | 0/1847 [00:00