The loss value is really big and drops slowly, is this normal? #4
Comments
According to your data, the training hasn't completed an epoch. Therefore, this is normal.
Thank you. I find that loading the data takes about 20 minutes per iteration, so the GPU is always waiting for the CPU to decode frames from the .avi files. Besides more threading (the CPU is already running at almost full power), is there any other way to speed up data loading?
Yes, this is a problem. We use DMMN/dataset/amot/videocaptureasync.py, lines 34 to 41 in a644196.
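For context, here is a minimal sketch of the general pattern behind an asynchronous video reader: a background thread keeps decoding frames into a bounded queue so the training loop does not block on I/O. This is only an illustration of the idea, not the actual contents of videocaptureasync.py.

```python
import queue
import threading

import cv2


class AsyncVideoCapture:
    """Decode frames on a background thread and hand them out from a bounded queue.

    Illustration only; not the repo's actual videocaptureasync.py implementation.
    """

    def __init__(self, path, max_queue=64):
        self.cap = cv2.VideoCapture(path)
        self.frames = queue.Queue(maxsize=max_queue)
        self.thread = threading.Thread(target=self._reader, daemon=True)
        self.thread.start()

    def _reader(self):
        # Keep decoding ahead of the consumer; put() blocks when the buffer is full.
        while True:
            ok, frame = self.cap.read()
            if not ok:
                break
            self.frames.put(frame)
        self.frames.put(None)  # sentinel: video exhausted

    def read(self):
        # Next buffered frame, or None once the video is exhausted.
        return self.frames.get()

    def release(self):
        self.cap.release()
```

The bounded queue keeps memory use in check while letting decoding overlap with the GPU work; if decoding itself is the bottleneck, pre-extracting frames (as tried in the next comment) usually helps more.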
OK. Let me try.
I used ffmpeg to extract every frame of the amot set in advance and wrote a new version of get_frame that reads those extracted frames. I ran this version with two RTX 2080 Ti GPUs, batch_size=16, num_workers=0. Each iteration now takes about 70 sec, but the GPU still spends a long time waiting for the CPU. Any further advice?
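A rough sketch of what such a get_frame can look like when the frames have been dumped with ffmpeg. The directory layout and the %06d.jpg naming pattern here are assumptions for illustration, not necessarily the layout actually used above.

```python
import os

import cv2


def get_frame(frame_root, video_name, frame_index):
    """Load one pre-extracted frame instead of decoding the .avi at training time."""
    # Assumed layout: <frame_root>/<video_name>/000001.jpg, 000002.jpg, ...
    # e.g. produced by: ffmpeg -i Easy_Camera_8.avi <frame_root>/<video_name>/%06d.jpg
    path = os.path.join(frame_root, video_name, "{:06d}.jpg".format(frame_index + 1))
    frame = cv2.imread(path)  # returns None if the file is missing or unreadable
    if frame is None:
        raise FileNotFoundError(path)
    return frame
```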
In your configuration, each iteration will load 128 frames. How about a configuration like the following:
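Purely as a hypothetical illustration (not the maintainer's actual settings): the per-iteration frame count presumably comes from batch_size × frame_max_input_num (8 × 16 = 128 with the posted config), so lowering either value cuts the decoding work per step.

```python
# Hypothetical example only, not the maintainer's original settings. The point is that
# the frames loaded per iteration presumably scale with batch_size x frame_max_input_num,
# so halving either value halves the decoding work per step.
suggested_overrides = {
    "frame_max_input_num": 8,   # down from 16
    "train": {
        "batch_size": 4,        # down from 8 in the posted config
        "num_workers": 4,       # overlap decoding with the GPU step
    },
}
```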
I think the data loading time is affected by other processes on the CPU. There are two models running on different GPUs, each using num_workers > 0, which may lead to the long waiting time. With your setting, the time varies from 5 sec to 256 sec.
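When the GPU sits idle waiting for the loader, the usual PyTorch-side knobs are num_workers and pin_memory; with two trainings sharing one machine, their worker processes compete for the same CPU cores, so the combined worker count should stay below the physical core count. A minimal, self-contained sketch (the TensorDataset is only a stand-in for the real AMOT dataset):

```python
# Minimal sketch of the DataLoader knobs usually tuned when the GPU waits on the CPU.
# The TensorDataset below is dummy data standing in for the real dataset.
import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == "__main__":
    dataset = TensorDataset(torch.randn(64, 3, 168, 168))  # dummy frames, frame_size=168

    loader = DataLoader(
        dataset,
        batch_size=8,
        shuffle=True,
        num_workers=4,    # decode/augment in separate worker processes
        pin_memory=True,  # faster host-to-GPU copies
    )

    for (batch,) in loader:
        pass  # training step would go here
```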
GPU:RTX2080Ti
pytorch:1.5
CUDA:10.2
dataset:amot
loading configure: config_gpu4_amot.json========
{
"dataset_name": "AMOTD",
"dataset_path": "~/data/omot_partial_dataset",
"phase": "train",
"frame_max_input_num": 16,
"frame_sample_scale": 2,
"parameter_frame_scale": 0.25,
"random_select_frame": false,
"min_valid_node_rate": 0.15,
"num_classes": 2,
"cuda": true,
"frame_size": 168,
"pixel_mean": [57, 52, 50],
"num_motion_model_param": 12,
"video_fps": 30.0,
"image_width": 1920,
"image_height": 1080,
"label_map": {
"vehicle": 1
},
"replace_map": {
"vehicle": 1
},
"test": {
"resume": "./test_logs/weights/ssdt67650.pth",
"dataset_type": "train",
"batch_size": 1,
"num_workers": 1,
"lr_decay_per_epoch": [1, 30, 45, 50],
"base_net_weights": null,
"log_save_folder": "./logs/test_logs/logs",
"image_save_folder": "./logs/test_logs/images",
"weights_save_folder": "./logs/test_logs/weights",
"sequence_list": "./dataset/amot/sequence_list_town02_train_part.txt",
"save_weight_per_epoch": 5,
"start_epoch": 0,
"end_epoch": 55,
"tensorboard": true,
"port": 6006,
"momentum": 0.9,
"weight_decay": 5e-4,
"gamma": 0.1,
"send_images": true,
"log_iters": true,
"run_mode": "debug",
"debug_save_image": false,
"debug_save_feature_map": false,
"save_track_data": true,
"contrast_lower": 0.5,
"contrast_upper": 1.5,
"saturation_lower": 0.5,
"saturation_upper": 1.5,
"hue_delta": 18.0,
"brightness_delta": 32,
"max_expand_ratio": 1.1,
"detect_bkg_label": 0,
"detect_top_k": 300,
"detect_conf_thresh": 0.3,
"detect_nms_thresh": 0.3,
"detect_exist_thresh": 0.5,
"tracker_min_iou_thresh": 0.001,
"tracker_min_visibility": 0.4
},
"train": {
"resume": null,
"batch_size": 8,
"num_workers": 0,
"learning_rate": 1e-3,
"lr_decay_per_epoch": [30, 50, 70, 90],
"base_net_weights": "./weights/resnext-101-64f-kinetics.pth",
"log_save_folder": "./logs/train_logs/log",
"image_save_folder": "./logs/train_logs/image",
"weights_save_folder": "./logs/train_logs/weights",
"sequence_list": "./dataset/amot/sequence_list_town02_train_part.txt",
"save_weight_per_epoch": 0.2,
"start_epoch": 0,
"end_epoch": 200,
"tensorboard": true,
"port": 6006,
"momentum": 0.9,
"weight_decay": 5e-4,
"gamma": 0.1,
"send_images": true,
"log_iters": true,
"run_mode": "release",
"debug_save_image": false,
"debug_save_feature_map": false,
"contrast_lower": 0.5,
"contrast_upper": 1.5,
"saturation_lower": 0.5,
"saturation_upper": 1.5,
"hue_delta": 18.0,
"brightness_delta": 32,
"max_expand_ratio": 1.1,
"static_possiblity": 0.05,
"loss_overlap_thresh": 0.5,
"loss_background_label": 0,
"dataset_overlap_thresh": 0.75
},
"frame_work":{
"temporal_dims": [8, 4, 2, 1, 1, 1],
"channel_dims": [256, 512, 1024, 2048, 2048, 2048],
"feature_maps": [42, 21, 11, 6, 3, 2],
"steps": [4, 8, 16, 28, 56, 84],
"min_sizes": [4, 16, 32, 64, 108, 146],
"max_sizes": [16, 32, 64, 108, 146, 176],
"aspect_ratios": [[1.5, 2], [2, 3], [2, 3], [2], [2], [2]],
"boxes_scales": [[0.8333, 0.6667, 0.5, 0.4], [0.8333, 0.5], [0.8333, 0.5], [0.5], [], []],
"variance": [0.1, 0.2],
"branch_cnn": 3,
"clip": true
},
"base_net":{
"mode": "feature",
"model_name": "resnext",
"model_depth": 101,
"resnet_shortcut": "B",
"arch": "resnext-101"
}
}
reading: ~/data/omot_partial_dataset/train/Town02/Clear/50/Easy_Camera_8.avi: 80%|███████████████████████▏ | 4/5 [00:11<00:02, 3.00s/it]
Loading base network...
Loading base net weights into state dict...
Finish
Timer: 8.9696 sec.
iter 0, 3078 || epoch: 0.0000 || Loss: 1485.7682 || Saving weights, iter: 0
Timer: 1.1191 sec.
iter 10, 3078 || epoch: 0.0032 || Loss: 747.8365 || Timer: 1.3092 sec.
iter 20, 3078 || epoch: 0.0065 || Loss: 543.7720 || Timer: 0.9543 sec.
iter 30, 3078 || epoch: 0.0097 || Loss: 483.3675 || Timer: 1.3921 sec.
iter 40, 3078 || epoch: 0.0130 || Loss: 365.0275 || Timer: 0.8044 sec.
iter 50, 3078 || epoch: 0.0162 || Loss: 326.6543 || Timer: 0.9312 sec.
iter 60, 3078 || epoch: 0.0195 || Loss: 366.3070 || Timer: 1.2947 sec.
iter 70, 3078 || epoch: 0.0227 || Loss: 369.2639 || Timer: 1.1285 sec.
iter 80, 3078 || epoch: 0.0260 || Loss: 350.6907 || Timer: 0.8387 sec.
iter 90, 3078 || epoch: 0.0292 || Loss: 279.8749 || Timer: 0.9261 sec.
iter 100, 3078 || epoch: 0.0325 || Loss: 373.5823 || Timer: 0.9241 sec.
iter 110, 3078 || epoch: 0.0357 || Loss: 280.2119 || Timer: 1.1693 sec.
iter 120, 3078 || epoch: 0.0390 || Loss: 273.2498 || Timer: 1.5808 sec.
iter 130, 3078 || epoch: 0.0422 || Loss: 292.5098 || Timer: 0.8698 sec.
iter 140, 3078 || epoch: 0.0455 || Loss: 318.0992 || Timer: 1.0063 sec.
iter 150, 3078 || epoch: 0.0487 || Loss: 252.3025 || Timer: 0.9630 sec.
iter 160, 3078 || epoch: 0.0520 || Loss: 265.3047 || Timer: 1.4840 sec.
iter 170, 3078 || epoch: 0.0552 || Loss: 302.5854 || Timer: 0.9548 sec.
iter 180, 3078 || epoch: 0.0585 || Loss: 285.4070 || Timer: 0.9854 sec.
iter 190, 3078 || epoch: 0.0617 || Loss: 364.6956 || Timer: 4.4852 sec.
iter 200, 3078 || epoch: 0.0650 || Loss: 330.2629 || Timer: 1.0476 sec.
iter 210, 3078 || epoch: 0.0682 || Loss: 230.9167 || Timer: 0.9508 sec.
iter 220, 3078 || epoch: 0.0715 || Loss: 244.9536 || Timer: 0.8630 sec.
iter 230, 3078 || epoch: 0.0747 || Loss: 251.5319 || Timer: 0.8497 sec.
iter 240, 3078 || epoch: 0.0780 || Loss: 263.2910 || Timer: 0.8375 sec.
iter 250, 3078 || epoch: 0.0812 || Loss: 249.8669 || Timer: 0.9519 sec.
iter 260, 3078 || epoch: 0.0845 || Loss: 240.7322 || Timer: 1.1921 sec.
iter 270, 3078 || epoch: 0.0877 || Loss: 263.4745 || Timer: 1.1276 sec.
iter 280, 3078 || epoch: 0.0910 || Loss: 231.0480 || Timer: 1.2120 sec.
iter 290, 3078 || epoch: 0.0942 || Loss: 319.0433 || Timer: 0.7737 sec.
iter 300, 3078 || epoch: 0.0975 || Loss: 253.2475 || Timer: 1.1360 sec.
iter 310, 3078 || epoch: 0.1007 || Loss: 197.9579 || Timer: 0.9226 sec.
iter 320, 3078 || epoch: 0.1040 || Loss: 269.4134 || Timer: 0.9375 sec.