diff --git a/docs/module_usage/tutorials/ocr_modules/layout_detection.en.md b/docs/module_usage/tutorials/ocr_modules/layout_detection.en.md index b730ca3f1..106c38860 100644 --- a/docs/module_usage/tutorials/ocr_modules/layout_detection.en.md +++ b/docs/module_usage/tutorials/ocr_modules/layout_detection.en.md @@ -31,6 +31,14 @@ The core task of structure analysis is to parse and segment the content of input An efficient layout area localization model trained on the PubLayNet dataset based on PicoDet-1x can locate five types of areas, including text, titles, tables, images, and lists. +PicoDet_layout_1x_tableInference Model/Trained Model +95.7 +12.623 +90.8934 +7.4 M +An efficient layout area localization model trained on a self-built dataset based on PicoDet-1x can locate one type of area: tables. + + PicoDet-S_layout_3clsInference Model/Trained Model 87.1 13.5 diff --git a/docs/module_usage/tutorials/ocr_modules/layout_detection.md b/docs/module_usage/tutorials/ocr_modules/layout_detection.md index f69dbd2d0..84fb1f818 100644 --- a/docs/module_usage/tutorials/ocr_modules/layout_detection.md +++ b/docs/module_usage/tutorials/ocr_modules/layout_detection.md @@ -31,6 +31,14 @@ comments: true 基于PicoDet-1x在PubLayNet数据集训练的高效率版面区域定位模型,可定位包含文字、标题、表格、图片以及列表这5类区域 +PicoDet_layout_1x_table推理模型/训练模型 +95.7 +12.623 +90.8934 +7.4 M +基于PicoDet-1x在自建数据集训练的高效率版面区域定位模型,可定位包含表格这1类区域 + + PicoDet-S_layout_3cls推理模型/训练模型 87.1 13.5 diff --git a/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction.en.md b/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction.en.md index bec85e481..b658c6234 100644 --- a/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction.en.md +++ b/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction.en.md @@ -66,6 +66,14 @@ The PP-ChatOCRv3-doc pipeline includes modules for Table Structure Rec An efficient 
layout area localization model trained on the PubLayNet dataset based on PicoDet-1x can locate five types of areas, including text, titles, tables, images, and lists. +PicoDet_layout_1x_tableInference Model/Trained Model +95.7 +12.623 +90.8934 +7.4 M +An efficient layout area localization model trained on a self-built dataset based on PicoDet-1x can locate one type of area: tables. + + PicoDet-S_layout_3clsInference Model/Trained Model 87.1 13.5 diff --git a/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction.md b/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction.md index 24384c492..a52df5739 100644 --- a/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction.md +++ b/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction.md @@ -66,6 +66,14 @@ comments: true 基于PicoDet-1x在PubLayNet数据集训练的高效率版面区域定位模型,可定位包含文字、标题、表格、图片以及列表这5类区域 +PicoDet_layout_1x_table推理模型/训练模型 +95.7 +12.623 +90.8934 +7.4 M +基于PicoDet-1x在自建数据集训练的高效率版面区域定位模型,可定位包含表格这1类区域 + + PicoDet-S_layout_3cls推理模型/训练模型 87.1 13.5 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.en.md b/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.en.md index 84bb8d190..741d7d89d 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.en.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.en.md @@ -64,6 +64,14 @@ The General Layout Parsing Pipeline includes modules for table structure An efficient layout area localization model trained on the PubLayNet dataset based on PicoDet-1x can locate five types of areas, including text, titles, tables, images, and lists. +PicoDet_layout_1x_tableInference Model/Trained Model +95.7 +12.623 +90.8934 +7.4 M +An efficient layout area localization model trained on a self-built dataset based on PicoDet-1x can locate one type of area: tables. 
+ + PicoDet-S_layout_3clsInference Model/Trained Model 87.1 13.5 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md b/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md index 2e8c8c319..2967bbd9b 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md @@ -64,6 +64,14 @@ comments: true 基于PicoDet-1x在PubLayNet数据集训练的高效率版面区域定位模型,可定位包含文字、标题、表格、图片以及列表这5类区域 +PicoDet_layout_1x_table推理模型/训练模型 +95.7 +12.623 +90.8934 +7.4 M +基于PicoDet-1x在自建数据集训练的高效率版面区域定位模型,可定位包含表格这1类区域 + + PicoDet-S_layout_3cls推理模型/训练模型 87.1 13.5 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/seal_recognition.en.md b/docs/pipeline_usage/tutorials/ocr_pipelines/seal_recognition.en.md index b708defb7..3c4c3c456 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/seal_recognition.en.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/seal_recognition.en.md @@ -36,6 +36,14 @@ The Seal Recognition pipeline includes a layout area analysis module, a s An efficient layout area localization model trained on the PubLayNet dataset based on PicoDet-1x can locate five types of areas, including text, titles, tables, images, and lists. +PicoDet_layout_1x_tableInference Model/Trained Model +95.7 +12.623 +90.8934 +7.4 M +An efficient layout area localization model trained on a self-built dataset based on PicoDet-1x can locate one type of area: tables. 
+ + PicoDet-S_layout_3clsInference Model/Trained Model 87.1 13.5 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/seal_recognition.md b/docs/pipeline_usage/tutorials/ocr_pipelines/seal_recognition.md index b0e9e7360..6d4279cef 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/seal_recognition.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/seal_recognition.md @@ -38,6 +38,14 @@ comments: true 基于PicoDet-1x在PubLayNet数据集训练的高效率版面区域定位模型,可定位包含文字、标题、表格、图片以及列表这5类区域 +PicoDet_layout_1x_table推理模型/训练模型 +95.7 +12.623 +90.8934 +7.4 M +基于PicoDet-1x在自建数据集训练的高效率版面区域定位模型,可定位包含表格这1类区域 + + PicoDet-S_layout_3cls推理模型/训练模型 87.1 13.5 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.en.md b/docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.en.md index 4c5db29c4..9fb1e27dd 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.en.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.en.md @@ -70,6 +70,14 @@ SLANet_plus is an enhanced version of SLANet, a table structure recognition mode An efficient layout area localization model trained on the PubLayNet dataset based on PicoDet-1x can locate five types of areas, including text, titles, tables, images, and lists. +PicoDet_layout_1x_tableInference Model/Trained Model +95.7 +12.623 +90.8934 +7.4 M +An efficient layout area localization model trained on a self-built dataset based on PicoDet-1x can locate one type of area: tables. 
+ + PicoDet-S_layout_3clsInference Model/Trained Model 87.1 13.5 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.md b/docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.md index 68d51bc34..319a690d3 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.md @@ -65,6 +65,14 @@ comments: true 基于PicoDet-1x在PubLayNet数据集训练的高效率版面区域定位模型,可定位包含文字、标题、表格、图片以及列表这5类区域 +PicoDet_layout_1x_table推理模型/训练模型 +95.7 +12.623 +90.8934 +7.4 M +基于PicoDet-1x在自建数据集训练的高效率版面区域定位模型,可定位包含表格这1类区域 + + PicoDet-S_layout_3cls推理模型/训练模型 87.1 13.5 diff --git a/docs/practical_tutorials/document_scene_information_extraction(layout_detection)_tutorial.en.md b/docs/practical_tutorials/document_scene_information_extraction(layout_detection)_tutorial.en.md index a41345a98..c228cd3e9 100644 --- a/docs/practical_tutorials/document_scene_information_extraction(layout_detection)_tutorial.en.md +++ b/docs/practical_tutorials/document_scene_information_extraction(layout_detection)_tutorial.en.md @@ -109,6 +109,14 @@ PaddleX provides 4 end-to-end layout detection models, which can be referenced i An efficient layout area localization model trained on the PubLayNet dataset based on PicoDet-1x can locate five types of areas, including text, titles, tables, images, and lists. +PicoDet_layout_1x_table +95.7 +12.623 +90.8934 +7.4 M +An efficient layout area localization model trained on a self-built dataset based on PicoDet-1x can locate one type of area: tables. 
+ + PicoDet-S_layout_3cls 87.1 13.5 diff --git a/docs/practical_tutorials/document_scene_information_extraction(layout_detection)_tutorial.md b/docs/practical_tutorials/document_scene_information_extraction(layout_detection)_tutorial.md index 75ab34b51..39893102b 100644 --- a/docs/practical_tutorials/document_scene_information_extraction(layout_detection)_tutorial.md +++ b/docs/practical_tutorials/document_scene_information_extraction(layout_detection)_tutorial.md @@ -114,6 +114,14 @@ PaddleX 提供了 4 个端到端的版面区域定位模型,具体可参考 [ 基于PicoDet-1x在PubLayNet数据集训练的高效率版面区域定位模型,可定位包含文字、标题、表格、图片以及列表这5类区域 +PicoDet_layout_1x_table +95.7 +12.623 +90.8934 +7.4 M +基于PicoDet-1x在自建数据集训练的高效率版面区域定位模型,可定位包含表格这1类区域 + + PicoDet-S_layout_3cls 87.1 13.5 diff --git a/docs/support_list/models_list.en.md b/docs/support_list/models_list.en.md index 4567c795c..9a38cc974 100644 --- a/docs/support_list/models_list.en.md +++ b/docs/support_list/models_list.en.md @@ -1963,6 +1963,14 @@ PaddleX incorporates multiple pipelines, each containing several modules, and ea PicoDet_layout_1x.yaml Inference Model/Trained Model +PicoDet_layout_1x_table +95.7 +12.623 +90.8934 +7.4 M +PicoDet_layout_1x_table.yaml +Inference Model/Trained Model + PicoDet-S_layout_3cls 87.1 13.521 diff --git a/docs/support_list/models_list.md b/docs/support_list/models_list.md index a50176ec5..b8a8a86bd 100644 --- a/docs/support_list/models_list.md +++ b/docs/support_list/models_list.md @@ -1961,6 +1961,14 @@ PaddleX 内置了多条产线,每条产线都包含了若干模块,每个模 PicoDet_layout_1x.yaml 推理模型/训练模型 +PicoDet_layout_1x_table +95.7 +12.623 +90.8934 +7.4 M +PicoDet_layout_1x_table.yaml +推理模型/训练模型 + PicoDet-S_layout_3cls 87.1 13.521 diff --git a/paddlex/configs/structure_analysis/PicoDet_layout_1x_table.yaml b/paddlex/configs/structure_analysis/PicoDet_layout_1x_table.yaml new file mode 100644 index 000000000..665eccab8 --- /dev/null +++ b/paddlex/configs/structure_analysis/PicoDet_layout_1x_table.yaml @@ -0,0 +1,40 @@ +Global: + model: PicoDet_layout_1x_table + mode: check_dataset # 
check_dataset/train/evaluate/predict + dataset_dir: "/paddle/dataset/paddlex/layout/det_layout_examples" + device: gpu:0,1,2,3 + output: "output" + +CheckDataset: + convert: + enable: False + src_dataset_type: null + split: + enable: False + train_percent: null + val_percent: null + +Train: + num_classes: 11 + epochs_iters: 50 + batch_size: 24 + learning_rate: 0.4 + pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/PicoDet_layout_1x_table_pretrained.pdparams + warmup_steps: 100 + resume_path: null + log_interval: 1 + eval_interval: 1 + +Evaluate: + weight_path: "output/best_model/best_model.pdparams" + log_interval: 10 + +Export: + weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/PicoDet_layout_1x_table_pretrained.pdparams + +Predict: + batch_size: 1 + model_dir: "output/best_model/inference" + input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg" + kernel_option: + run_mode: paddle diff --git a/paddlex/inference/utils/official_models.py b/paddlex/inference/utils/official_models.py index d3d1c95d2..8a01d0aa0 100644 --- a/paddlex/inference/utils/official_models.py +++ b/paddlex/inference/utils/official_models.py @@ -211,6 +211,7 @@ "PP-LCNet_x1_0_vehicle_attribute": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/\ PP-LCNet_x1_0_vehicle_attribute_infer.tar", "PicoDet_layout_1x": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/PicoDet_layout_1x_infer.tar", + "PicoDet_layout_1x_table": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/PicoDet_layout_1x_table_infer.tar", "SLANet": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/SLANet_infer.tar", "SLANet_plus": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/SLANet_plus_infer.tar", "LaTeX_OCR_rec": 
"https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/LaTeX_OCR_rec_infer.tar", diff --git a/paddlex/modules/object_detection/model_list.py b/paddlex/modules/object_detection/model_list.py index 66f41b040..9ada86d18 100644 --- a/paddlex/modules/object_detection/model_list.py +++ b/paddlex/modules/object_detection/model_list.py @@ -26,6 +26,7 @@ "RT-DETR-R50", "RT-DETR-X", "PicoDet_layout_1x", + "PicoDet_layout_1x_table", "PicoDet-S_layout_3cls", "PicoDet-S_layout_17cls", "PicoDet-L_layout_3cls", diff --git a/paddlex/pipelines/table_recognition.yaml b/paddlex/pipelines/table_recognition.yaml index b11a4cbe4..2783d1da9 100644 --- a/paddlex/pipelines/table_recognition.yaml +++ b/paddlex/pipelines/table_recognition.yaml @@ -3,7 +3,7 @@ Global: input: https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg Pipeline: - layout_model: PicoDet_layout_1x + layout_model: PicoDet_layout_1x_table table_model: SLANet_plus text_det_model: PP-OCRv4_mobile_det text_rec_model: PP-OCRv4_mobile_rec diff --git a/paddlex/repo_apis/PaddleDetection_api/configs/PicoDet_layout_1x_table.yaml b/paddlex/repo_apis/PaddleDetection_api/configs/PicoDet_layout_1x_table.yaml new file mode 100644 index 000000000..cedfc3675 --- /dev/null +++ b/paddlex/repo_apis/PaddleDetection_api/configs/PicoDet_layout_1x_table.yaml @@ -0,0 +1,159 @@ +# Runtime +epoch: 50 +log_iter: 1 +find_unused_parameters: true +use_gpu: true +use_xpu: false +use_mlu: false +use_npu: false +use_ema: true +save_dir: output +snapshot_epoch: 1 +cycle_epoch: 10 +print_flops: false +print_params: false + +# Dataset +metric: COCO +num_classes: 5 + +worker_num: 8 + +TrainDataset: + name: COCODetDataset + image_dir: images + anno_path: annotations/instance_train.json + dataset_dir: datasets/COCO + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + name: COCODetDataset + image_dir: images + anno_path: annotations/instance_val.json + dataset_dir: 
datasets/COCO + allow_empty: true + +TestDataset: + name: ImageFolder + anno_path: annotations/instance_val.json + dataset_dir: datasets/COCO + +TrainReader: + sample_transforms: + - Decode: {} + - RandomCrop: {} + - RandomFlip: {prob: 0.5} + - RandomDistort: {} + batch_transforms: + - BatchRandomResize: {target_size: [[768, 576], [800, 608], [832, 640]], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_size: 24 + shuffle: true + drop_last: true + collate_batch: false + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 608], keep_ratio: False} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 8 + shuffle: false + +TestReader: + inputs_def: + image_shape: [3, 800, 608] + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 608], keep_ratio: False} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 1 + shuffle: false + +# Model +architecture: PicoDet +pretrain_weights: https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams + +PicoDet: + backbone: LCNet + neck: CSPPAN + head: PicoHead + nms_cpu: true + +LCNet: + scale: 1.0 + feature_maps: [3, 4, 5] + +CSPPAN: + out_channels: 128 + use_depthwise: True + num_csp_blocks: 1 + num_features: 4 + +PicoHead: + conv_feat: + name: PicoFeat + feat_in: 128 + feat_out: 128 + num_convs: 4 + num_fpn_stride: 4 + norm_type: bn + share_cls_reg: True + fpn_stride: [8, 16, 32, 64] + feat_in_chan: 128 + prior_prob: 0.01 + reg_max: 7 + cell_offset: 0.5 + loss_class: + name: VarifocalLoss + use_sigmoid: True + iou_weighted: True + loss_weight: 1.0 + loss_dfl: + name: 
DistributionFocalLoss + loss_weight: 0.25 + loss_bbox: + name: GIoULoss + loss_weight: 2.0 + assigner: + name: SimOTAAssigner + candidate_topk: 10 + iou_weight: 6 + nms: + name: MultiClassNMS + nms_top_k: 1000 + keep_top_k: 100 + score_threshold: 0.025 + nms_threshold: 0.6 + +# Optimizer +LearningRate: + base_lr: 0.4 + schedulers: + - name: CosineDecay + max_epochs: 100 + - name: LinearWarmup + start_factor: 0.1 + steps: 100 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.00004 + type: L2 + +# Export +export: + post_process: true + nms: true + benchmark: false + fuse_conv_bn: false diff --git a/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py b/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py index 0344a688f..a571d28e5 100644 --- a/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +++ b/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py @@ -25,6 +25,9 @@ {"name": "Table", "id": 3}, {"name": "Figure", "id": 4}, ], + "PicoDet_layout_1x_table": [ + {"name": "Table", "id": 0}, + ], "PicoDet-S_layout_3cls": [ {"name": "image", "id": 0}, {"name": "table", "id": 1}, diff --git a/paddlex/repo_apis/PaddleDetection_api/object_det/register.py b/paddlex/repo_apis/PaddleDetection_api/object_det/register.py index 3be60ec03..d42ed99c6 100644 --- a/paddlex/repo_apis/PaddleDetection_api/object_det/register.py +++ b/paddlex/repo_apis/PaddleDetection_api/object_det/register.py @@ -217,6 +217,21 @@ } ) +register_model_info( + { + "model_name": "PicoDet_layout_1x_table", + "suite": "Det", + "config_path": osp.join(PDX_CONFIG_DIR, "PicoDet_layout_1x_table.yaml"), + "supported_apis": ["train", "evaluate", "predict", "export"], + "supported_dataset_types": ["COCODetDataset"], + "supported_train_opts": { + "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"], + "dy2st": False, + "amp": ["OFF"], + }, + } +) + register_model_info( { "model_name": 
"YOLOv3-DarkNet53",