diff --git a/alexi/__init__.py b/alexi/__init__.py index 32c113f..b42ed93 100644 --- a/alexi/__init__.py +++ b/alexi/__init__.py @@ -23,7 +23,7 @@ from .label import Identificateur from .search import search from .segment import DEFAULT_MODEL as DEFAULT_SEGMENT_MODEL -from .segment import RNNSegmenteur, Segmenteur +from .segment import Segmenteur LOGGER = logging.getLogger("alexi") VERSION = "0.4.0" @@ -60,10 +60,7 @@ def convert_main(args: argparse.Namespace): def segment_main(args: argparse.Namespace): """Segmenter un CSV""" crf: Segmenteur - if args.model.suffix == ".pt": - crf = RNNSegmenteur(args.model) - else: - crf = Segmenteur(args.model) + crf = Segmenteur(args.model) reader = csv.DictReader(args.csv) write_csv(crf(reader), sys.stdout) @@ -144,7 +141,7 @@ def make_argparse() -> argparse.ArgumentParser: "segment", help="Segmenter et étiquetter les segments d'un CSV" ) segment.add_argument( - "--model", help="Modele CRF ou RNN", type=Path, default=DEFAULT_SEGMENT_MODEL + "--model", help="Modele CRF", type=Path, default=DEFAULT_SEGMENT_MODEL ) segment.add_argument( "csv", diff --git a/alexi/extract.py b/alexi/extract.py index 9b1d190..f36272c 100644 --- a/alexi/extract.py +++ b/alexi/extract.py @@ -19,7 +19,7 @@ from alexi.label import Identificateur from alexi.link import Resolver from alexi.segment import DEFAULT_MODEL as DEFAULT_SEGMENT_MODEL -from alexi.segment import DEFAULT_MODEL_NOSTRUCT, RNNSegmenteur, Segmenteur +from alexi.segment import DEFAULT_MODEL_NOSTRUCT, Segmenteur from alexi.types import T_obj LOGGER = logging.getLogger("extract") @@ -39,7 +39,7 @@ def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: help="Ne pas utiliser le CSV de référence", action="store_true", ) - parser.add_argument("--segment-model", help="Modele CRF/RNN", type=Path) + parser.add_argument("--segment-model", help="Modele CRF", type=Path) parser.add_argument( "--label-model", help="Modele CRF", type=Path, default=DEFAULT_LABEL_MODEL ) @@ -342,10 +342,7 @@ def __init__( self.outdir = outdir self.crf_s = Identificateur() if segment_model is not None: - if segment_model.suffix == ".pt": - self.crf = RNNSegmenteur(segment_model) - else: - self.crf = Segmenteur(segment_model) + self.crf = Segmenteur(segment_model) self.crf_n = None else: self.crf = Segmenteur(DEFAULT_SEGMENT_MODEL) diff --git a/alexi/models/rnn.json b/alexi/models/rnn.json deleted file mode 100644 index 0d6f01f..0000000 --- a/alexi/models/rnn.json +++ /dev/null @@ -1,1517 +0,0 @@ -{ - "feat2id": { - "lower": { - "": 0, - "de": 1, - "la": 2, - "et": 3, - "le": 4, - "les": 5, - "à": 6, - "du": 7, - "ou": 8, - "des": 9, - "en": 10, - "un": 11, - "règlement": 12, - "une": 13, - "par": 14, - "pour": 15, - "au": 16, - "sur": 17, - "d’un": 18, - "aux": 19, - "dans": 20, - ":": 21, - "est": 22, - "permis": 23, - "présent": 24, - "dispositions": 25, - "que": 26, - "d’une": 27, - "être": 28, - "demande": 29, - "construction": 30, - "travaux": 31, - "doit": 32, - "–": 33, - "chapitre": 34, - "cas": 35, - "sont": 36, - "critères": 37, - "ville": 38, - "qui": 39, - "terrain": 40, - "tout": 41, - "objectifs": 42, - "relatifs": 43, - "toute": 44, - "plan": 45, - "pas": 46, - "numéro": 47, - "ne": 48, - "a": 49, - "projet": 50, - "bâtiment": 51, - "1": 52, - "«": 53, - "documents": 54, - "certificat": 55, - "2": 56, - "vigueur": 57, - "1.": 58, - "ce": 59, - "si": 60, - "lotissement": 61, - "2.": 62, - "autre": 63, - "section": 64, - ";": 65, - "peut": 66, - "conseil": 67, - "modification": 68, - "certificats": 69, - "lot": 70, - "fonctionnaire": 71, - "3.": 72, - "»": 73, - "cette": 74, - "plus": 75, - "avec": 76, - "relatif": 77, - "relatives": 78, - "vertu": 79, - "loi": 80, - "d'un": 81, - "requis": 82, - "entre": 83, - "suivants": 84, - "plans": 85, - "désigné": 86, - "partie": 87, - "cadastrale": 88, - "3": 89, - "doivent": 90, - "l’aménagement": 91, - "été": 92, - "4.": 93, - "rue": 94, - "domaine": 95, - "bâtiments": 96, - "renseignements": 97, - "ainsi": 98, - "tableau": 99, - "article": 100, - "d'une": 101, - "lorsque": 102, - "d’évaluation": 103, - "où": 104, - "personne": 105, - "zonage": 106, - "5.": 107, - "tous": 108, - "leur": 109, - "4": 110, - "règlements": 111, - "requérant": 112, - "architecturale": 113, - "c.": 114, - "conformément": 115, - "terrains": 116, - "ces": 117, - "y": 118, - "a.": 119, - "d’autorisation": 120, - "superficie": 121, - "public": 122, - "autorisation": 123, - "1314-2021-piia": 124, - "rues": 125, - "piia": 126, - "5": 127, - "texte": 128, - "conditions": 129, - "eaux": 130, - "dimensions": 131, - "type": 132, - "6": 133, - "non": 134, - "suivantes": 135, - "territoire": 136, - "disposition": 137, - "il": 138, - "localisation": 139, - "même": 140, - "fait": 141, - "principal": 142, - "comité": 143, - "construction,": 144, - "moins": 145, - "résolution": 146, - "sous-section": 147, - "6.": 148, - "règlement,": 149, - "son": 150, - "mètres": 151, - "faire": 152, - "sainte-adèle": 153, - "règlement.": 154, - "dont": 155, - "visés": 156, - "selon": 157, - "10": 158, - "sa": 159, - "soit": 160, - "fins": 161, - "opération": 162, - "d’implantation": 163, - "interprétatives": 164, - "comme": 165, - "ses": 166, - "administratives": 167, - "lots": 168, - "publique": 169, - "1314-2021-pc": 170, - "sainte-adèle.": 171, - "échéant,": 172, - "cours": 173, - "titre": 174, - "projets": 175, - "toutes": 176, - "spécifiques": 177, - "milieux": 178, - "se": 179, - "l’objet": 180, - "-": 181, - "visé": 182, - "circulation": 183, - "l’emprise": 184, - "s’appliquent": 185, - "copie": 186, - "conforme": 187, - "développement": 188, - "b.": 189, - "document": 190, - "l’ensemble": 191, - "d’urbanisme": 192, - "cadre": 193, - "dérogation": 194, - "l’implantation": 195, - "zones": 196, - "d'occupation": 197, - "milieu": 198, - "incluant": 199, - "municipaux": 200, - "déclaratoires,": 201, - "respecter": 202, - "7.": 203, - "voie": 204, - "sous": 205, - "propriétaire": 206, - "types": 207, - "sans": 208, - "interventions": 209, - "rapport": 210, - "objectif": 211, - "délivrance": 212, - "n’est": 213, - "d’eau": 214, - "municipal": 215, - "district": 216, - "qu’il": 217, - "services": 218, - "chaque": 219, - "parc": 220, - "l’intérieur": 221, - "additionnels": 222, - "7": 223, - "droits": 224, - "permettre": 225, - "promoteur": 226, - "situé": 227, - "cession": 228, - "l’article": 229, - "peuvent": 230, - "générales": 231, - "d’intégration": 232, - "sol": 233, - "autres": 234, - "8": 235, - "conformité": 236, - "assujettis": 237, - "ouvrages": 238, - "présente": 239, - "avant": 240, - "15": 241, - "zone": 242, - "immeuble": 243, - "ayant": 244, - "signé": 245, - "s’applique": 246, - "remplacement": 247, - "terrain,": 248, - "espaces": 249, - "20": 250, - "9": 251, - "montrant": 252, - "constructions": 253, - "8.": 254, - "ligne": 255, - "lors": 256, - "limites": 257, - "suivant": 258, - "30": 259, - "usage": 260, - "ouvrage": 261, - "applicables": 262, - "s’il": 263, - "contradiction": 264, - "sens": 265, - "nouvelle": 266, - "réseau": 267, - "distance": 268, - "voies": 269, - "également": 270, - "(s)": 271, - "projetée": 272, - "associés": 273, - "règles": 274, - "date": 275, - "leurs": 276, - "frais": 277, - "système": 278, - "stationnement": 279, - "l’usage": 280, - "d’interventions": 281, - "sécurité": 282, - "lui": 283, - "entrée": 284, - "infrastructures": 285, - "plus,": 286, - "audrey": 287, - "usages": 288, - "table": 289, - "l’intégration": 290, - "matériaux": 291, - "conseiller": 292, - "l'autorisation": 293, - "d'autorisation": 294, - "(l.r.q.,": 295, - "attendu": 296, - "niveau": 297, - "9.": 298, - "garantie": 299, - "nadine": 300, - "intégré": 301, - "durable": 302, - "lieu": 303, - "monsieur": 304, - "11": 305, - "12": 306, - "informations": 307, - "propriété": 308, - "l’application": 309, - "effet": 310, - "ville,": 311, - "droit": 312, - "jours": 313, - "tel": 314, - "consultatif": 315, - "17": 316, - "10.": 317, - "l’opération": 318, - "21": 319, - "d’aménagement": 320, - "senécal": 321, - "durée": 322, - "structure": 323, - "propriétaire,": 324, - "d’affaires": 325, - "fiche": 326, - "l’emplacement": 327, - "d.": 328, - "membre": 329, - "suite": 330, - "l’approbation": 331, - "mineure": 332, - "deux": 333, - "l’autorité": 334, - "mrc": 335, - "ingénieur": 336, - "plusieurs": 337, - "vigueur.": 338, - "cet": 339, - "paragraphe": 340, - "valeur": 341, - "qualité": 342, - "l'occupant": 343, - "mesures": 344, - "compétente": 345, - "agrandissement": 346, - "québec": 347, - "elle": 348, - "100": 349, - "protection": 350, - "nécessaires": 351, - "réalisation": 352, - "aucun": 353, - "avoir": 354, - "personnes": 355, - "d’accès": 356, - "visant": 357, - "permettant": 358, - "19": 359, - "fin": 360, - "projeté": 361, - "pourvu": 362, - "territoires": 363, - "sainte-adèle,": 364, - "ont": 365, - "professionnel": 366, - "tracé": 367, - "%": 368, - "ni": 369, - "brière": 370, - "analyse": 371, - "pente": 372, - "vigueur;": 373, - "format": 374, - "d’évaluer": 375, - "laquelle": 376, - "exigences": 377, - "contribution": 378, - "allées": 379, - "éléments": 380, - "travaux,": 381, - "acquis": 382, - "infraction": 383, - "annexe": 384, - "électronique": 385, - "membres": 386, - "délai": 387, - "description": 388, - "limite": 389, - "démolition": 390, - "avis": 391, - "aucune": 392, - "requise": 393, - "cas,": 394, - "séance": 395, - "ordinaire": 396, - "forme": 397, - "faisant": 398, - "devoirs": 399, - "accompagnée": 400, - "demande.": 401, - "contrôle": 402, - "aménagement": 403, - "sera": 404, - "l’identification": 405, - "nature": 406, - "soient": 407, - "afin": 408, - "ville.": 409, - "comprend": 410, - "majeur": 411, - "50": 412, - "devis": 413, - "d’intérêt": 414, - "prévus": 415, - "site": 416, - "avril": 417, - "interprétation": 418, - "sauf": 419, - "normes": 420, - "existante": 421, - "minimale": 422, - "pays-d’en-haut.": 423, - "linéaire": 424, - "assujetties": 425, - "chapitre,": 426, - "pouvoirs": 427, - "prévues": 428, - "exigés": 429, - "fournir": 430, - "mairesse": 431, - "modifications": 432, - "lequel": 433, - "l’environnement": 434, - "traitement": 435, - "applicable": 436, - "service": 437, - "public,": 438, - "code": 439, - "000": 440, - "ministère": 441, - "tableaux": 442, - "largeur": 443, - "humide": 444, - "règlementation": 445, - "l’échelle": 446, - "patrimonial": 447, - "l'occupation": 448, - "11.": 449, - "montant": 450, - "accès": 451, - "nombre": 452, - "qu’une": 453, - "e.": 454, - "existants": 455, - "contenu": 456, - "jour": 457, - "mois": 458, - "alors": 459, - "période": 460, - "règlement;": 461, - "espace": 462, - "fonction": 463, - "indiquant": 464, - "ententes": 465, - "l’adresse": 466, - "tenue": 467, - "lecture": 468, - "qu'il": 469, - "a)": 470, - "nom": 471, - "paiement": 472, - "mise": 473, - "l’immeuble": 474, - "1314-2021-l": 475, - "pdf": 476, - "relative": 477, - "s'il": 478, - "13": 479, - "notamment": 480, - "équipements": 481, - "signature": 482, - "obligation": 483, - "l’agrandissement": 484, - "changement": 485, - "lieux": 486, - "seul": 487, - "figures": 488, - "d’aqueduc": 489, - "lac": 490, - "suffisante": 491, - "toit": 492, - "maison": 493, - "b)": 494, - "14": 495, - "surface": 496, - "projetés": 497, - "logements": 498, - "16": 499, - "dépôt": 500, - "18": 501, - "loi.": 502, - "d’ensemble": 503, - "nord": 504, - "compréhension": 505, - "12.": 506, - "l’ordre": 507, - "l’accès": 508, - "vente": 509, - "état": 510, - "train": 511, - "bâtiment,": 512, - "l’installation": 513, - "l’entretien": 514, - "assujetti": 515, - "comprenant": 516, - "existantes": 517, - "après": 518, - "caractérisation": 519, - "délais": 520, - "articles": 521, - "me": 522, - "l’avis": 523, - "parties": 524, - "»,": 525, - "certains": 526, - "permis,": 527, - "nouveau": 528, - "constitue": 529, - "».": 530, - "exécutés": 531, - "accordée": 532, - "réserve": 533, - "tolérance": 534, - "l’annexe": 535, - "p’tit": 536, - "hautes": 537, - "extérieur": 538, - "particulières": 539, - "compris": 540, - "architecturaux": 541, - "étude": 542, - "lieu;": 543, - "minimales": 544, - "décision": 545, - "échéant;": 546, - "celui-ci": 547, - "moment": 548, - "l’égard": 549, - "directeur": 550, - "lois": 551, - "matières,": 552, - "l’expression": 553, - "préparé": 554, - "généraux": 555, - "dérogatoire": 556, - "cadastre": 557, - "techniques": 558, - "l’autorisation": 559, - "murs": 560, - "secteur": 561, - "terrain;": 562, - "conformes": 563, - "qu’un": 564, - "1314-2021-z": 565, - "13.": 566, - "autorisés": 567, - "élément": 568, - "lignes": 569, - "physique": 570, - "24": 571, - "intitulé": 572, - "l’emploi": 573, - "entente": 574, - "additionnel": 575, - "sommet": 576, - "c)": 577, - "drainage": 578, - "composantes": 579, - "prolongement": 580, - "présentation": 581, - "caractéristiques": 582, - "f.": 583, - "greffière": 584, - "existant": 585, - "référence": 586, - "réalisés": 587, - "civil": 588, - "modalités": 589, - "défaut": 590, - "temporaire": 591, - "bordure": 592, - "tarif": 593, - "vigueur,": 594, - "l’une": 595, - "atteints": 596, - "22": 597, - "auquel": 598, - "coupe": 599, - "stationnements": 600, - "versant": 601, - "toiture": 602, - "madame": 603, - "formant": 604, - "greffier": 605, - "motion": 606, - "hauteur": 607, - "remise": 608, - "immeubles": 609, - "l’urbanisme": 610, - "habitation": 611, - "manière": 612, - "morale": 613, - "mesure": 614, - "aménagements": 615, - "quiconque": 616, - "ceux": 617, - "l’occupant,": 618, - "responsable": 619, - "terme": 620, - "l’entrée": 621, - "adjacent": 622, - "arrière": 623, - "1314-2021-tm": 624, - "mot": 625, - "objet": 626, - "autorisé": 627, - "déclaration": 628, - "demande;": 629, - "naturel": 630, - "réglementation": 631, - "14.": 632, - "nécessaire": 633, - "fois": 634, - "directrice": 635, - "juridiques": 636, - "d’occupation": 637, - "logement": 638, - "place": 639, - "partir": 640, - "$": 641, - "d’autres": 642, - "hors": 643, - "accompagner": 644, - "matières": 645, - "prévaut;": 646, - "23": 647, - "couleur,": 648, - "municipal,": 649, - "terminologie": 650, - "n'est": 651, - "application": 652, - "recours": 653, - "celles": 654, - "approuvé": 655, - "demande,": 656, - "conseil.": 657, - "déjà": 658, - "g.": 659, - "l’autre": 660, - "peut,": 661, - "concernant": 662, - "alinéa": 663, - "n’a": 664, - "indique": 665, - "autorité": 666, - "utilisée": 667, - "tarifs": 668, - "servitude": 669, - "garage": 670, - "2021": 671, - "intégrante": 672, - "naturels": 673, - "lac,": 674, - "l’entente": 675, - "récurrence": 676, - "d’eau,": 677, - "durable.": 678, - "située": 679, - "cités": 680, - "prévaut.": 681, - "pourrait": 682, - "transmise": 683, - "autorisée": 684, - "ou,": 685, - "exemplaire": 686, - "assujettie": 687, - "prendre": 688, - "d’approbation": 689, - "d’égout": 690, - "l’ajout": 691, - "québec.": 692, - "tels": 693, - "respect": 694, - "matière": 695, - "l'emprise": 696, - "configuration": 697, - "finale": 698, - "accessoire": 699, - "réparation": 700, - "permettent": 701, - "ans": 702, - "15.": 703, - "ski": 704, - "lorsqu’une": 705, - "ouverture": 706, - "privée": 707, - "lotissement,": 708, - "catégorie": 709, - "32": 710, - "41": 711, - "cases": 712, - "façade": 713, - "1314-2021-c": 714, - "donné": 715, - "l'immeuble": 716, - "bâti": 717, - "chargement": 718, - "mars": 719, - "différentes": 720, - "ci-après": 721, - "raison": 722, - "calcul": 723, - "ingénieurs": 724, - "2)": 725, - "émis": 726, - "sentier": 727, - "mode": 728, - "constructions,": 729, - "compte": 730, - "rénovation": 731, - "amende": 732, - "contre": 733, - "minimum": 734, - "infractions": 735, - "l’interprétation": 736, - "16.": 737, - "l’utilisation": 738, - "naturelle": 739, - "pénalités": 740, - "font": 741, - "dérogations": 742, - "inclut": 743, - "l’exception": 744, - "publiques": 745, - "25": 746, - "l’ouverture": 747, - "localisé": 748, - "rue,": 749, - "nécessitant": 750, - "allée": 751, - "nom,": 752, - "biologiste": 753, - "gestion": 754, - "crue": 755, - "architectural": 756, - "maire": 757, - "transmission": 758, - "déposé": 759, - "déposée": 760, - "photos": 761, - "stationnement,": 762, - "aires": 763, - "pendant": 764, - "adopté": 765, - "déplacement": 766, - "et,": 767, - "échelle": 768, - "destiné": 769, - "1)": 770, - "organisme": 771, - "jusqu’à": 772, - "28": 773, - "occupant": 774, - "d’application": 775, - "données": 776, - "doit,": 777, - "plans,": 778, - "sections": 779, - "respecte": 780, - "somme": 781, - "ce,": 782, - "but": 783, - "responsabilité": 784, - "générale": 785, - "17.": 786, - "moyenne": 787, - "abrogation": 788, - "on": 789, - "supérieure": 790, - "mineures": 791, - "portée": 792, - "..............................................................................................................": 793, - "19.": 794, - "pluriel": 795, - "pouvoir": 796, - "jeux": 797, - "signée": 798, - "arpenteur-géomètre": 799, - "copropriété": 800, - "cout": 801, - "captage": 802, - "l’érosion": 803, - "principal,": 804, - "professionnels": 805, - "l’acceptation": 806, - "...........................................................................................................................................................................": 807, - "exceptions": 808, - "ouvertures,": 809, - "présents": 810, - "contexte": 811, - "établissant": 812, - "demandes": 813, - "condition": 814, - "écrit": 815, - "motifs": 816, - "exiger": 817, - "telle": 818, - "proposé": 819, - "conseillère": 820, - "suit": 821, - "h.": 822, - "pentes": 823, - "attestation": 824, - "dernier": 825, - "charge": 826, - "page": 827, - "l’extérieur": 828, - "usées": 829, - "validité": 830, - "l’occupation": 831, - "26": 832, - "contravention": 833, - "occupé": 834, - "utilisés": 835, - "temps": 836, - "opérations": 837, - "b": 838, - "paragraphes": 839, - "applicables.": 840, - "municipalité": 841, - "l’affichage": 842, - "18.": 843, - "__________________________": 844, - "rémunération": 845, - "base": 846, - "l’exécutant": 847, - "chapitres,": 848, - "attribués": 849, - "parcs,": 850, - "60": 851, - "vise": 852, - "cadastrale.": 853, - "36": 854, - "papier": 855, - "57": 856, - "projetés;": 857, - "plancher": 858, - "pose": 859, - "centre-ville": 860 - }, - "fontname": { - "": 0, - "ArialNarrow": 1, - "Arial": 2, - "ArialMT": 3, - "ArialNarrow-Bold": 4, - "F1": 5, - "F2": 6, - "Helvetica": 7, - "Verdana": 8, - "Arial-BoldMT": 9, - "Cambria": 10, - "F8": 11, - "Arial,Bold": 12, - "Arial-ItalicMT": 13, - "Helvetica-Bold": 14, - "F3": 15, - "ArialNarrow-Italic": 16, - "Calibri": 17, - "Arial,Italic": 18, - "Verdana,Bold": 19, - "Times-Bold": 20, - "Lato": 21, - "Verdana-Bold": 22, - "ArialNarrow-BoldItalic": 23, - "Times-Roman": 24, - "F6": 25, - "Helvetica-Oblique": 26, - "Verdana,Italic": 27, - "TimesNewRomanPSMT": 28 - }, - "rgb": { - "": 0, - "#000": 1, - "#222": 2, - "#777": 3, - "#378": 4, - "#444": 5, - "#f00": 6, - "#555": 7, - "#333": 8 - }, - "mctag": { - "": 0, - "P": 1, - "Span": 2, - "Artifact": 3, - "Div": 4, - "Suspect": 5 - }, - "element": { - "": 0, - "H6": 1, - "H5": 2, - "LBody": 3, - "P": 4, - "H1": 5, - "Span": 6, - "H4": 7, - "H3": 8, - "H2": 9, - "TOCI": 10, - "Link": 11, - "Lbl": 12, - "Figure": 13 - }, - "x0": { - "": 0, - "13": 1, - "16": 2, - "14": 3, - "11": 4, - "17": 5, - "20": 6, - "22": 7, - "21": 8, - "19": 9, - "18": 10, - "25": 11, - "8": 12, - "23": 13, - "24": 14, - "27": 15, - "28": 16, - "29": 17, - "31": 18, - "26": 19, - "15": 20, - "30": 21, - "32": 22, - "33": 23, - "38": 24, - "34": 25, - "35": 26, - "39": 27, - "37": 28, - "36": 29, - "40": 30, - "41": 31, - "43": 32, - "42": 33, - "48": 34, - "12": 35, - "45": 36, - "47": 37, - "44": 38, - "51": 39, - "46": 40, - "49": 41, - "10": 42, - "50": 43, - "52": 44, - "7": 45, - "9": 46, - "53": 47, - "6": 48, - "54": 49, - "63": 50, - "69": 51, - "55": 52, - "57": 53, - "59": 54, - "56": 55, - "62": 56, - "58": 57, - "60": 58, - "64": 59, - "61": 60, - "66": 61, - "65": 62, - "70": 63, - "68": 64, - "67": 65 - }, - "x1": { - "": 0, - "52": 1, - "17": 2, - "15": 3, - "20": 4, - "22": 5, - "19": 6, - "21": 7, - "16": 8, - "25": 9, - "26": 10, - "24": 11, - "23": 12, - "18": 13, - "27": 14, - "28": 15, - "31": 16, - "14": 17, - "29": 18, - "38": 19, - "30": 20, - "12": 21, - "33": 22, - "32": 23, - "34": 24, - "37": 25, - "35": 26, - "36": 27, - "39": 28, - "40": 29, - "41": 30, - "42": 31, - "48": 32, - "43": 33, - "44": 34, - "51": 35, - "46": 36, - "47": 37, - "13": 38, - "45": 39, - "50": 40, - "49": 41, - "10": 42, - "11": 43, - "54": 44, - "70": 45, - "9": 46, - "53": 47, - "64": 48, - "8": 49, - "63": 50, - "69": 51, - "55": 52, - "58": 53, - "56": 54, - "57": 55, - "59": 56, - "60": 57, - "62": 58, - "61": 59, - "7": 60, - "65": 61, - "66": 62, - "67": 63, - "68": 64 - }, - "top": { - "": 0, - "4": 1, - "21": 2, - "5": 3, - "20": 4, - "44": 5, - "43": 6, - "45": 7, - "15": 8, - "26": 9, - "19": 10, - "18": 11, - "38": 12, - "25": 13, - "50": 14, - "35": 15, - "49": 16, - "33": 17, - "52": 18, - "42": 19, - "58": 20, - "59": 21, - "27": 22, - "22": 23, - "57": 24, - "30": 25, - "53": 26, - "28": 27, - "29": 28, - "34": 29, - "36": 30, - "14": 31, - "41": 32, - "40": 33, - "48": 34, - "55": 35, - "60": 36, - "56": 37, - "17": 38, - "23": 39, - "51": 40, - "37": 41, - "72": 42, - "67": 43, - "54": 44, - "69": 45, - "73": 46, - "39": 47, - "12": 48, - "62": 49, - "46": 50, - "13": 51, - "75": 52, - "61": 53, - "76": 54, - "74": 55, - "68": 56, - "16": 57, - "71": 58, - "83": 59, - "47": 60, - "24": 61, - "70": 62, - "32": 63, - "78": 64, - "66": 65, - "64": 66, - "11": 67, - "31": 68, - "65": 69, - "63": 70, - "84": 71, - "81": 72, - "7": 73, - "88": 74, - "10": 75, - "8": 76, - "77": 77, - "85": 78, - "79": 79, - "86": 80, - "82": 81, - "9": 82, - "87": 83, - "80": 84, - "89": 85, - "90": 86, - "91": 87, - "95": 88, - "96": 89, - "94": 90, - "6": 91, - "92": 92, - "97": 93 - }, - "bottom": { - "": 0, - "4": 1, - "6": 2, - "21": 3, - "22": 4, - "45": 5, - "44": 6, - "16": 7, - "19": 8, - "30": 9, - "50": 10, - "53": 11, - "27": 12, - "58": 13, - "46": 14, - "35": 15, - "20": 16, - "34": 17, - "43": 18, - "39": 19, - "23": 20, - "60": 21, - "36": 22, - "31": 23, - "38": 24, - "56": 25, - "15": 26, - "28": 27, - "59": 28, - "62": 29, - "52": 30, - "40": 31, - "42": 32, - "25": 33, - "26": 34, - "73": 35, - "61": 36, - "51": 37, - "57": 38, - "54": 39, - "37": 40, - "49": 41, - "24": 42, - "13": 43, - "17": 44, - "75": 45, - "68": 46, - "29": 47, - "70": 48, - "47": 49, - "41": 50, - "55": 51, - "77": 52, - "72": 53, - "69": 54, - "48": 55, - "14": 56, - "65": 57, - "74": 58, - "71": 59, - "18": 60, - "67": 61, - "84": 62, - "63": 63, - "76": 64, - "86": 65, - "8": 66, - "11": 67, - "79": 68, - "33": 69, - "32": 70, - "82": 71, - "78": 72, - "66": 73, - "64": 74, - "12": 75, - "80": 76, - "83": 77, - "10": 78, - "89": 79, - "88": 80, - "85": 81, - "87": 82, - "9": 83, - "81": 84, - "90": 85, - "91": 86, - "92": 87, - "96": 88, - "97": 89, - "95": 90, - "5": 91, - "93": 92, - "7": 93, - "98": 94 - }, - "x0:delta": { - "": 0, - "0": 1, - "1": 2, - "-4": 3, - "-2": 4, - "-3": 5, - "-1": 6, - "-6": 7, - "2": 8, - "3": 9, - "-5": 10, - "4": 11, - "5": 12 - }, - "x1:delta": { - "": 0, - "0": 1, - "1": 2, - "-4": 3, - "-3": 4, - "-2": 5, - "-1": 6, - "-6": 7, - "2": 8, - "3": 9, - "-5": 10, - "4": 11, - "5": 12 - }, - "top:delta": { - "": 0, - "0": 1, - "1": 2, - "4": 3, - "2": 4, - "5": 5, - "3": 6, - "6": 7, - "9": 8, - "8": 9 - }, - "bottom:delta": { - "": 0, - "0": 1, - "1": 2, - "4": 3, - "2": 4, - "5": 5, - "3": 6, - "6": 7, - "9": 8 - }, - "x0:delta:delta": { - "": 0, - "0": 1, - "-1": 2, - "1": 3 - }, - "x1:delta:delta": { - "": 0, - "0": 1, - "1": 2, - "-1": 3 - }, - "top:delta:delta": { - "": 0, - "0": 1, - "1": 2, - "-1": 3 - }, - "bottom:delta:delta": { - "": 0, - "0": 1, - "1": 2, - "-1": 3 - }, - "line:indent": { - "": 0, - "0": 1, - "-2": 2, - "2": 3, - "3": 4, - "-3": 5, - "4": 6, - "-5": 7, - "-4": 8, - "1": 9, - "-6": 10, - "-7": 11, - "7": 12, - "5": 13, - "6": 14, - "-9": 15, - "11": 16, - "-18": 17, - "9": 18, - "-11": 19, - "-17": 20, - "18": 21, - "-1": 22, - "13": 23, - "-15": 24, - "-14": 25, - "-8": 26, - "-10": 27, - "-19": 28, - "-28": 29, - "-22": 30, - "-21": 31, - "-12": 32, - "-16": 33, - "12": 34, - "8": 35, - "-24": 36, - "-20": 37, - "-30": 38 - }, - "line:gap": { - "": 0, - "0": 1, - "1": 2, - "2": 3, - "4": 4, - "3": 5, - "5": 6, - "7": 7, - "8": 8, - "62": 9, - "6": 10, - "44": 11, - "10": 12, - "14": 13, - "18": 14, - "20": 15, - "15": 16, - "11": 17, - "-1": 18, - "42": 19, - "24": 20, - "16": 21, - "31": 22, - "59": 23, - "-26": 24, - "19": 25, - "46": 26, - "63": 27, - "61": 28, - "70": 29, - "22": 30, - "-4": 31, - "34": 32, - "17": 33, - "9": 34, - "67": 35, - "29": 36, - "64": 37, - "75": 38, - "28": 39, - "13": 40, - "45": 41, - "60": 42, - "26": 43, - "51": 44, - "47": 45, - "12": 46, - "-2": 47, - "32": 48, - "43": 49, - "23": 50 - }, - "line:height": { - "": 0, - "1": 1, - "2": 2, - "3": 3 - } - }, - "id2label": [ - "O", - "I-Titre", - "I-Tete", - "I-TOC", - "I-SousSection", - "I-Section", - "I-Pied", - "I-Liste", - "I-Figure", - "I-Chapitre", - "I-Article", - "I-Annexe", - "I-Alinea", - "B-Titre", - "B-Tete", - "B-TOC", - "B-SousSection", - "B-Section", - "B-Pied", - "B-Liste", - "B-Figure", - "B-Chapitre", - "B-Article", - "B-Annexe", - "B-Alinea" - ], - "featdims": { - "lower": 32, - "fontname": 8, - "rgb": 8, - "mctag": 8, - "element": 8, - "x0": 8, - "x1": 8, - "top": 8, - "bottom": 8, - "x0:delta": 8, - "x1:delta": 8, - "top:delta": 8, - "bottom:delta": 8, - "x0:delta:delta": 8, - "x1:delta:delta": 8, - "top:delta:delta": 8, - "bottom:delta:delta": 8, - "line:indent": 8, - "line:gap": 8, - "line:height": 8 - }, - "veclen": 11, - "label_weights": [ - 0.005208333333333333, - 0.00130718954248366, - 0.000259000259000259, - 0.00016630633627141194, - 0.0027472527472527475, - 0.002331002331002331, - 0.002336448598130841, - 2.8462458017874424e-05, - 0.034482758620689655, - 0.0033783783783783786, - 0.00031486146095717883, - 0.014705882352941176, - 2.2560122727067635e-05, - 0.007575757575757576, - 0.0036900369003690036, - 0.043478260869565216, - 0.0196078431372549, - 0.011494252873563218, - 0.0035211267605633804, - 0.0006199628022318661, - 0.3333333333333333, - 0.023255813953488372, - 0.0014367816091954023, - 0.125, - 0.000700770847932726 - ], - "hidden_size": 80, - "labels": "literal" -} \ No newline at end of file diff --git a/alexi/models/rnn.pt b/alexi/models/rnn.pt deleted file mode 100644 index 2411096..0000000 Binary files a/alexi/models/rnn.pt and /dev/null differ diff --git a/alexi/models/rnn_crf.json b/alexi/models/rnn_crf.json deleted file mode 100644 index f68fa19..0000000 --- a/alexi/models/rnn_crf.json +++ /dev/null @@ -1,1518 +0,0 @@ -{ - "feat2id": { - "lower": { - "": 0, - "de": 1, - "la": 2, - "et": 3, - "le": 4, - "les": 5, - "à": 6, - "du": 7, - "ou": 8, - "des": 9, - "en": 10, - "un": 11, - "règlement": 12, - "une": 13, - "par": 14, - "pour": 15, - "au": 16, - "sur": 17, - "d’un": 18, - "aux": 19, - "dans": 20, - ":": 21, - "est": 22, - "permis": 23, - "présent": 24, - "dispositions": 25, - "que": 26, - "d’une": 27, - "être": 28, - "demande": 29, - "construction": 30, - "travaux": 31, - "doit": 32, - "–": 33, - "chapitre": 34, - "cas": 35, - "sont": 36, - "critères": 37, - "ville": 38, - "qui": 39, - "terrain": 40, - "tout": 41, - "objectifs": 42, - "relatifs": 43, - "toute": 44, - "plan": 45, - "pas": 46, - "numéro": 47, - "ne": 48, - "a": 49, - "projet": 50, - "bâtiment": 51, - "1": 52, - "«": 53, - "documents": 54, - "certificat": 55, - "2": 56, - "vigueur": 57, - "1.": 58, - "ce": 59, - "si": 60, - "lotissement": 61, - "2.": 62, - "autre": 63, - "section": 64, - ";": 65, - "peut": 66, - "conseil": 67, - "modification": 68, - "certificats": 69, - "lot": 70, - "fonctionnaire": 71, - "3.": 72, - "»": 73, - "cette": 74, - "plus": 75, - "avec": 76, - "relatif": 77, - "relatives": 78, - "vertu": 79, - "loi": 80, - "d'un": 81, - "requis": 82, - "entre": 83, - "suivants": 84, - "plans": 85, - "désigné": 86, - "partie": 87, - "cadastrale": 88, - "3": 89, - "doivent": 90, - "l’aménagement": 91, - "été": 92, - "4.": 93, - "rue": 94, - "domaine": 95, - "bâtiments": 96, - "renseignements": 97, - "ainsi": 98, - "tableau": 99, - "article": 100, - "d'une": 101, - "lorsque": 102, - "d’évaluation": 103, - "où": 104, - "personne": 105, - "zonage": 106, - "5.": 107, - "tous": 108, - "leur": 109, - "4": 110, - "règlements": 111, - "requérant": 112, - "architecturale": 113, - "c.": 114, - "conformément": 115, - "terrains": 116, - "ces": 117, - "y": 118, - "a.": 119, - "d’autorisation": 120, - "superficie": 121, - "public": 122, - "autorisation": 123, - "1314-2021-piia": 124, - "rues": 125, - "piia": 126, - "5": 127, - "texte": 128, - "conditions": 129, - "eaux": 130, - "dimensions": 131, - "type": 132, - "6": 133, - "non": 134, - "suivantes": 135, - "territoire": 136, - "disposition": 137, - "il": 138, - "localisation": 139, - "même": 140, - "fait": 141, - "principal": 142, - "comité": 143, - "construction,": 144, - "moins": 145, - "résolution": 146, - "sous-section": 147, - "6.": 148, - "règlement,": 149, - "son": 150, - "mètres": 151, - "faire": 152, - "sainte-adèle": 153, - "règlement.": 154, - "dont": 155, - "visés": 156, - "selon": 157, - "10": 158, - "sa": 159, - "soit": 160, - "fins": 161, - "opération": 162, - "d’implantation": 163, - "interprétatives": 164, - "comme": 165, - "ses": 166, - "administratives": 167, - "lots": 168, - "publique": 169, - "1314-2021-pc": 170, - "sainte-adèle.": 171, - "échéant,": 172, - "cours": 173, - "titre": 174, - "projets": 175, - "toutes": 176, - "spécifiques": 177, - "milieux": 178, - "se": 179, - "l’objet": 180, - "-": 181, - "visé": 182, - "circulation": 183, - "l’emprise": 184, - "s’appliquent": 185, - "copie": 186, - "conforme": 187, - "développement": 188, - "b.": 189, - "document": 190, - "l’ensemble": 191, - "d’urbanisme": 192, - "cadre": 193, - "dérogation": 194, - "l’implantation": 195, - "zones": 196, - "d'occupation": 197, - "milieu": 198, - "incluant": 199, - "municipaux": 200, - "déclaratoires,": 201, - "respecter": 202, - "7.": 203, - "voie": 204, - "sous": 205, - "propriétaire": 206, - "types": 207, - "sans": 208, - "interventions": 209, - "rapport": 210, - "objectif": 211, - "délivrance": 212, - "n’est": 213, - "d’eau": 214, - "municipal": 215, - "district": 216, - "qu’il": 217, - "services": 218, - "chaque": 219, - "parc": 220, - "l’intérieur": 221, - "additionnels": 222, - "7": 223, - "droits": 224, - "permettre": 225, - "promoteur": 226, - "situé": 227, - "cession": 228, - "l’article": 229, - "peuvent": 230, - "générales": 231, - "d’intégration": 232, - "sol": 233, - "autres": 234, - "8": 235, - "conformité": 236, - "assujettis": 237, - "ouvrages": 238, - "présente": 239, - "avant": 240, - "15": 241, - "zone": 242, - "immeuble": 243, - "ayant": 244, - "signé": 245, - "s’applique": 246, - "remplacement": 247, - "terrain,": 248, - "espaces": 249, - "20": 250, - "9": 251, - "montrant": 252, - "constructions": 253, - "8.": 254, - "ligne": 255, - "lors": 256, - "limites": 257, - "suivant": 258, - "30": 259, - "usage": 260, - "ouvrage": 261, - "applicables": 262, - "s’il": 263, - "contradiction": 264, - "sens": 265, - "nouvelle": 266, - "réseau": 267, - "distance": 268, - "voies": 269, - "également": 270, - "(s)": 271, - "projetée": 272, - "associés": 273, - "règles": 274, - "date": 275, - "leurs": 276, - "frais": 277, - "système": 278, - "stationnement": 279, - "l’usage": 280, - "d’interventions": 281, - "sécurité": 282, - "lui": 283, - "entrée": 284, - "infrastructures": 285, - "plus,": 286, - "audrey": 287, - "usages": 288, - "table": 289, - "l’intégration": 290, - "matériaux": 291, - "conseiller": 292, - "l'autorisation": 293, - "d'autorisation": 294, - "(l.r.q.,": 295, - "attendu": 296, - "niveau": 297, - "9.": 298, - "garantie": 299, - "nadine": 300, - "intégré": 301, - "durable": 302, - "lieu": 303, - "monsieur": 304, - "11": 305, - "12": 306, - "informations": 307, - "propriété": 308, - "l’application": 309, - "effet": 310, - "ville,": 311, - "droit": 312, - "jours": 313, - "tel": 314, - "consultatif": 315, - "17": 316, - "10.": 317, - "l’opération": 318, - "21": 319, - "d’aménagement": 320, - "senécal": 321, - "durée": 322, - "structure": 323, - "propriétaire,": 324, - "d’affaires": 325, - "fiche": 326, - "l’emplacement": 327, - "d.": 328, - "membre": 329, - "suite": 330, - "l’approbation": 331, - "mineure": 332, - "deux": 333, - "l’autorité": 334, - "mrc": 335, - "ingénieur": 336, - "plusieurs": 337, - "vigueur.": 338, - "cet": 339, - "paragraphe": 340, - "valeur": 341, - "qualité": 342, - "l'occupant": 343, - "mesures": 344, - "compétente": 345, - "agrandissement": 346, - "québec": 347, - "elle": 348, - "100": 349, - "protection": 350, - "nécessaires": 351, - "réalisation": 352, - "aucun": 353, - "avoir": 354, - "personnes": 355, - "d’accès": 356, - "visant": 357, - "permettant": 358, - "19": 359, - "fin": 360, - "projeté": 361, - "pourvu": 362, - "territoires": 363, - "sainte-adèle,": 364, - "ont": 365, - "professionnel": 366, - "tracé": 367, - "%": 368, - "ni": 369, - "brière": 370, - "analyse": 371, - "pente": 372, - "vigueur;": 373, - "format": 374, - "d’évaluer": 375, - "laquelle": 376, - "exigences": 377, - "contribution": 378, - "allées": 379, - "éléments": 380, - "travaux,": 381, - "acquis": 382, - "infraction": 383, - "annexe": 384, - "électronique": 385, - "membres": 386, - "délai": 387, - "description": 388, - "limite": 389, - "démolition": 390, - "avis": 391, - "aucune": 392, - "requise": 393, - "cas,": 394, - "séance": 395, - "ordinaire": 396, - "forme": 397, - "faisant": 398, - "devoirs": 399, - "accompagnée": 400, - "demande.": 401, - "contrôle": 402, - "aménagement": 403, - "sera": 404, - "l’identification": 405, - "nature": 406, - "soient": 407, - "afin": 408, - "ville.": 409, - "comprend": 410, - "majeur": 411, - "50": 412, - "devis": 413, - "d’intérêt": 414, - "prévus": 415, - "site": 416, - "avril": 417, - "interprétation": 418, - "sauf": 419, - "normes": 420, - "existante": 421, - "minimale": 422, - "pays-d’en-haut.": 423, - "linéaire": 424, - "assujetties": 425, - "chapitre,": 426, - "pouvoirs": 427, - "prévues": 428, - "exigés": 429, - "fournir": 430, - "mairesse": 431, - "modifications": 432, - "lequel": 433, - "l’environnement": 434, - "traitement": 435, - "applicable": 436, - "service": 437, - "public,": 438, - "code": 439, - "000": 440, - "ministère": 441, - "tableaux": 442, - "largeur": 443, - "humide": 444, - "règlementation": 445, - "l’échelle": 446, - "patrimonial": 447, - "l'occupation": 448, - "11.": 449, - "montant": 450, - "accès": 451, - "nombre": 452, - "qu’une": 453, - "e.": 454, - "existants": 455, - "contenu": 456, - "jour": 457, - "mois": 458, - "alors": 459, - "période": 460, - "règlement;": 461, - "espace": 462, - "fonction": 463, - "indiquant": 464, - "ententes": 465, - "l’adresse": 466, - "tenue": 467, - "lecture": 468, - "qu'il": 469, - "a)": 470, - "nom": 471, - "paiement": 472, - "mise": 473, - "l’immeuble": 474, - "1314-2021-l": 475, - "pdf": 476, - "relative": 477, - "s'il": 478, - "13": 479, - "notamment": 480, - "équipements": 481, - "signature": 482, - "obligation": 483, - "l’agrandissement": 484, - "changement": 485, - "lieux": 486, - "seul": 487, - "figures": 488, - "d’aqueduc": 489, - "lac": 490, - "suffisante": 491, - "toit": 492, - "maison": 493, - "b)": 494, - "14": 495, - "surface": 496, - "projetés": 497, - "logements": 498, - "16": 499, - "dépôt": 500, - "18": 501, - "loi.": 502, - "d’ensemble": 503, - "nord": 504, - "compréhension": 505, - "12.": 506, - "l’ordre": 507, - "l’accès": 508, - "vente": 509, - "état": 510, - "train": 511, - "bâtiment,": 512, - "l’installation": 513, - "l’entretien": 514, - "assujetti": 515, - "comprenant": 516, - "existantes": 517, - "après": 518, - "caractérisation": 519, - "délais": 520, - "articles": 521, - "me": 522, - "l’avis": 523, - "parties": 524, - "»,": 525, - "certains": 526, - "permis,": 527, - "nouveau": 528, - "constitue": 529, - "».": 530, - "exécutés": 531, - "accordée": 532, - "réserve": 533, - "tolérance": 534, - "l’annexe": 535, - "p’tit": 536, - "hautes": 537, - "extérieur": 538, - "particulières": 539, - "compris": 540, - "architecturaux": 541, - "étude": 542, - "lieu;": 543, - "minimales": 544, - "décision": 545, - "échéant;": 546, - "celui-ci": 547, - "moment": 548, - "l’égard": 549, - "directeur": 550, - "lois": 551, - "matières,": 552, - "l’expression": 553, - "préparé": 554, - "généraux": 555, - "dérogatoire": 556, - "cadastre": 557, - "techniques": 558, - "l’autorisation": 559, - "murs": 560, - "secteur": 561, - "terrain;": 562, - "conformes": 563, - "qu’un": 564, - "1314-2021-z": 565, - "13.": 566, - "autorisés": 567, - "élément": 568, - "lignes": 569, - "physique": 570, - "24": 571, - "intitulé": 572, - "l’emploi": 573, - "entente": 574, - "additionnel": 575, - "sommet": 576, - "c)": 577, - "drainage": 578, - "composantes": 579, - "prolongement": 580, - "présentation": 581, - "caractéristiques": 582, - "f.": 583, - "greffière": 584, - "existant": 585, - "référence": 586, - "réalisés": 587, - "civil": 588, - "modalités": 589, - "défaut": 590, - "temporaire": 591, - "bordure": 592, - "tarif": 593, - "vigueur,": 594, - "l’une": 595, - "atteints": 596, - "22": 597, - "auquel": 598, - "coupe": 599, - "stationnements": 600, - "versant": 601, - "toiture": 602, - "madame": 603, - "formant": 604, - "greffier": 605, - "motion": 606, - "hauteur": 607, - "remise": 608, - "immeubles": 609, - "l’urbanisme": 610, - "habitation": 611, - "manière": 612, - "morale": 613, - "mesure": 614, - "aménagements": 615, - "quiconque": 616, - "ceux": 617, - "l’occupant,": 618, - "responsable": 619, - "terme": 620, - "l’entrée": 621, - "adjacent": 622, - "arrière": 623, - "1314-2021-tm": 624, - "mot": 625, - "objet": 626, - "autorisé": 627, - "déclaration": 628, - "demande;": 629, - "naturel": 630, - "réglementation": 631, - "14.": 632, - "nécessaire": 633, - "fois": 634, - "directrice": 635, - "juridiques": 636, - "d’occupation": 637, - "logement": 638, - "place": 639, - "partir": 640, - "$": 641, - "d’autres": 642, - "hors": 643, - "accompagner": 644, - "matières": 645, - "prévaut;": 646, - "23": 647, - "couleur,": 648, - "municipal,": 649, - "terminologie": 650, - "n'est": 651, - "application": 652, - "recours": 653, - "celles": 654, - "approuvé": 655, - "demande,": 656, - "conseil.": 657, - "déjà": 658, - "g.": 659, - "l’autre": 660, - "peut,": 661, - "concernant": 662, - "alinéa": 663, - "n’a": 664, - "indique": 665, - "autorité": 666, - "utilisée": 667, - "tarifs": 668, - "servitude": 669, - "garage": 670, - "2021": 671, - "intégrante": 672, - "naturels": 673, - "lac,": 674, - "l’entente": 675, - "récurrence": 676, - "d’eau,": 677, - "durable.": 678, - "située": 679, - "cités": 680, - "prévaut.": 681, - "pourrait": 682, - "transmise": 683, - "autorisée": 684, - "ou,": 685, - "exemplaire": 686, - "assujettie": 687, - "prendre": 688, - "d’approbation": 689, - "d’égout": 690, - "l’ajout": 691, - "québec.": 692, - "tels": 693, - "respect": 694, - "matière": 695, - "l'emprise": 696, - "configuration": 697, - "finale": 698, - "accessoire": 699, - "réparation": 700, - "permettent": 701, - "ans": 702, - "15.": 703, - "ski": 704, - "lorsqu’une": 705, - "ouverture": 706, - "privée": 707, - "lotissement,": 708, - "catégorie": 709, - "32": 710, - "41": 711, - "cases": 712, - "façade": 713, - "1314-2021-c": 714, - "donné": 715, - "l'immeuble": 716, - "bâti": 717, - "chargement": 718, - "mars": 719, - "différentes": 720, - "ci-après": 721, - "raison": 722, - "calcul": 723, - "ingénieurs": 724, - "2)": 725, - "émis": 726, - "sentier": 727, - "mode": 728, - "constructions,": 729, - "compte": 730, - "rénovation": 731, - "amende": 732, - "contre": 733, - "minimum": 734, - "infractions": 735, - "l’interprétation": 736, - "16.": 737, - "l’utilisation": 738, - "naturelle": 739, - "pénalités": 740, - "font": 741, - "dérogations": 742, - "inclut": 743, - "l’exception": 744, - "publiques": 745, - "25": 746, - "l’ouverture": 747, - "localisé": 748, - "rue,": 749, - "nécessitant": 750, - "allée": 751, - "nom,": 752, - "biologiste": 753, - "gestion": 754, - "crue": 755, - "architectural": 756, - "maire": 757, - "transmission": 758, - "déposé": 759, - "déposée": 760, - "photos": 761, - "stationnement,": 762, - "aires": 763, - "pendant": 764, - "adopté": 765, - "déplacement": 766, - "et,": 767, - "échelle": 768, - "destiné": 769, - "1)": 770, - "organisme": 771, - "jusqu’à": 772, - "28": 773, - "occupant": 774, - "d’application": 775, - "données": 776, - "doit,": 777, - "plans,": 778, - "sections": 779, - "respecte": 780, - "somme": 781, - "ce,": 782, - "but": 783, - "responsabilité": 784, - "générale": 785, - "17.": 786, - "moyenne": 787, - "abrogation": 788, - "on": 789, - "supérieure": 790, - "mineures": 791, - "portée": 792, - "..............................................................................................................": 793, - "19.": 794, - "pluriel": 795, - "pouvoir": 796, - "jeux": 797, - "signée": 798, - "arpenteur-géomètre": 799, - "copropriété": 800, - "cout": 801, - "captage": 802, - "l’érosion": 803, - "principal,": 804, - "professionnels": 805, - "l’acceptation": 806, - "...........................................................................................................................................................................": 807, - "exceptions": 808, - "ouvertures,": 809, - "présents": 810, - "contexte": 811, - "établissant": 812, - "demandes": 813, - "condition": 814, - "écrit": 815, - "motifs": 816, - "exiger": 817, - "telle": 818, - "proposé": 819, - "conseillère": 820, - "suit": 821, - "h.": 822, - "pentes": 823, - "attestation": 824, - "dernier": 825, - "charge": 826, - "page": 827, - "l’extérieur": 828, - "usées": 829, - "validité": 830, - "l’occupation": 831, - "26": 832, - "contravention": 833, - "occupé": 834, - "utilisés": 835, - "temps": 836, - "opérations": 837, - "b": 838, - "paragraphes": 839, - "applicables.": 840, - "municipalité": 841, - "l’affichage": 842, - "18.": 843, - "__________________________": 844, - "rémunération": 845, - "base": 846, - "l’exécutant": 847, - "chapitres,": 848, - "attribués": 849, - "parcs,": 850, - "60": 851, - "vise": 852, - "cadastrale.": 853, - "36": 854, - "papier": 855, - "57": 856, - "projetés;": 857, - "plancher": 858, - "pose": 859, - "centre-ville": 860 - }, - "fontname": { - "": 0, - "ArialNarrow": 1, - "Arial": 2, - "ArialMT": 3, - "ArialNarrow-Bold": 4, - "F1": 5, - "F2": 6, - "Helvetica": 7, - "Verdana": 8, - "Arial-BoldMT": 9, - "Cambria": 10, - "F8": 11, - "Arial,Bold": 12, - "Arial-ItalicMT": 13, - "Helvetica-Bold": 14, - "F3": 15, - "ArialNarrow-Italic": 16, - "Calibri": 17, - "Arial,Italic": 18, - "Verdana,Bold": 19, - "Times-Bold": 20, - "Lato": 21, - "Verdana-Bold": 22, - "ArialNarrow-BoldItalic": 23, - "Times-Roman": 24, - "F6": 25, - "Helvetica-Oblique": 26, - "Verdana,Italic": 27, - "TimesNewRomanPSMT": 28 - }, - "rgb": { - "": 0, - "#000": 1, - "#222": 2, - "#777": 3, - "#378": 4, - "#444": 5, - "#f00": 6, - "#555": 7, - "#333": 8 - }, - "mctag": { - "": 0, - "P": 1, - "Span": 2, - "Artifact": 3, - "Div": 4, - "Suspect": 5 - }, - "element": { - "": 0, - "H6": 1, - "H5": 2, - "LBody": 3, - "P": 4, - "H1": 5, - "Span": 6, - "H4": 7, - "H3": 8, - "H2": 9, - "TOCI": 10, - "Link": 11, - "Lbl": 12, - "Figure": 13 - }, - "x0": { - "": 0, - "13": 1, - "16": 2, - "14": 3, - "11": 4, - "17": 5, - "20": 6, - "22": 7, - "21": 8, - "19": 9, - "18": 10, - "25": 11, - "8": 12, - "23": 13, - "24": 14, - "27": 15, - "28": 16, - "29": 17, - "31": 18, - "26": 19, - "15": 20, - "30": 21, - "32": 22, - "33": 23, - "38": 24, - "34": 25, - "35": 26, - "39": 27, - "37": 28, - "36": 29, - "40": 30, - "41": 31, - "43": 32, - "42": 33, - "48": 34, - "12": 35, - "45": 36, - "47": 37, - "44": 38, - "51": 39, - "46": 40, - "49": 41, - "10": 42, - "50": 43, - "52": 44, - "7": 45, - "9": 46, - "53": 47, - "6": 48, - "54": 49, - "63": 50, - "69": 51, - "55": 52, - "57": 53, - "59": 54, - "56": 55, - "62": 56, - "58": 57, - "60": 58, - "64": 59, - "61": 60, - "66": 61, - "65": 62, - "70": 63, - "68": 64, - "67": 65 - }, - "x1": { - "": 0, - "52": 1, - "17": 2, - "15": 3, - "20": 4, - "22": 5, - "19": 6, - "21": 7, - "16": 8, - "25": 9, - "26": 10, - "24": 11, - "23": 12, - "18": 13, - "27": 14, - "28": 15, - "31": 16, - "14": 17, - "29": 18, - "38": 19, - "30": 20, - "12": 21, - "33": 22, - "32": 23, - "34": 24, - "37": 25, - "35": 26, - "36": 27, - "39": 28, - "40": 29, - "41": 30, - "42": 31, - "48": 32, - "43": 33, - "44": 34, - "51": 35, - "46": 36, - "47": 37, - "13": 38, - "45": 39, - "50": 40, - "49": 41, - "10": 42, - "11": 43, - "54": 44, - "70": 45, - "9": 46, - "53": 47, - "64": 48, - "8": 49, - "63": 50, - "69": 51, - "55": 52, - "58": 53, - "56": 54, - "57": 55, - "59": 56, - "60": 57, - "62": 58, - "61": 59, - "7": 60, - "65": 61, - "66": 62, - "67": 63, - "68": 64 - }, - "top": { - "": 0, - "4": 1, - "21": 2, - "5": 3, - "20": 4, - "44": 5, - "43": 6, - "45": 7, - "15": 8, - "26": 9, - "19": 10, - "18": 11, - "38": 12, - "25": 13, - "50": 14, - "35": 15, - "49": 16, - "33": 17, - "52": 18, - "42": 19, - "58": 20, - "59": 21, - "27": 22, - "22": 23, - "57": 24, - "30": 25, - "53": 26, - "28": 27, - "29": 28, - "34": 29, - "36": 30, - "14": 31, - "41": 32, - "40": 33, - "48": 34, - "55": 35, - "60": 36, - "56": 37, - "17": 38, - "23": 39, - "51": 40, - "37": 41, - "72": 42, - "67": 43, - "54": 44, - "69": 45, - "73": 46, - "39": 47, - "12": 48, - "62": 49, - "46": 50, - "13": 51, - "75": 52, - "61": 53, - "76": 54, - "74": 55, - "68": 56, - "16": 57, - "71": 58, - "83": 59, - "47": 60, - "24": 61, - "70": 62, - "32": 63, - "78": 64, - "66": 65, - "64": 66, - "11": 67, - "31": 68, - "65": 69, - "63": 70, - "84": 71, - "81": 72, - "7": 73, - "88": 74, - "10": 75, - "8": 76, - "77": 77, - "85": 78, - "79": 79, - "86": 80, - "82": 81, - "9": 82, - "87": 83, - "80": 84, - "89": 85, - "90": 86, - "91": 87, - "95": 88, - "96": 89, - "94": 90, - "6": 91, - "92": 92, - "97": 93 - }, - "bottom": { - "": 0, - "4": 1, - "6": 2, - "21": 3, - "22": 4, - "45": 5, - "44": 6, - "16": 7, - "19": 8, - "30": 9, - "50": 10, - "53": 11, - "27": 12, - "58": 13, - "46": 14, - "35": 15, - "20": 16, - "34": 17, - "43": 18, - "39": 19, - "23": 20, - "60": 21, - "36": 22, - "31": 23, - "38": 24, - "56": 25, - "15": 26, - "28": 27, - "59": 28, - "62": 29, - "52": 30, - "40": 31, - "42": 32, - "25": 33, - "26": 34, - "73": 35, - "61": 36, - "51": 37, - "57": 38, - "54": 39, - "37": 40, - "49": 41, - "24": 42, - "13": 43, - "17": 44, - "75": 45, - "68": 46, - "29": 47, - "70": 48, - "47": 49, - "41": 50, - "55": 51, - "77": 52, - "72": 53, - "69": 54, - "48": 55, - "14": 56, - "65": 57, - "74": 58, - "71": 59, - "18": 60, - "67": 61, - "84": 62, - "63": 63, - "76": 64, - "86": 65, - "8": 66, - "11": 67, - "79": 68, - "33": 69, - "32": 70, - "82": 71, - "78": 72, - "66": 73, - "64": 74, - "12": 75, - "80": 76, - "83": 77, - "10": 78, - "89": 79, - "88": 80, - "85": 81, - "87": 82, - "9": 83, - "81": 84, - "90": 85, - "91": 86, - "92": 87, - "96": 88, - "97": 89, - "95": 90, - "5": 91, - "93": 92, - "7": 93, - "98": 94 - }, - "x0:delta": { - "": 0, - "0": 1, - "1": 2, - "-4": 3, - "-2": 4, - "-3": 5, - "-1": 6, - "-6": 7, - "2": 8, - "3": 9, - "-5": 10, - "4": 11, - "5": 12 - }, - "x1:delta": { - "": 0, - "0": 1, - "1": 2, - "-4": 3, - "-3": 4, - "-2": 5, - "-1": 6, - "-6": 7, - "2": 8, - "3": 9, - "-5": 10, - "4": 11, - "5": 12 - }, - "top:delta": { - "": 0, - "0": 1, - "1": 2, - "4": 3, - "2": 4, - "5": 5, - "3": 6, - "6": 7, - "9": 8, - "8": 9 - }, - "bottom:delta": { - "": 0, - "0": 1, - "1": 2, - "4": 3, - "2": 4, - "5": 5, - "3": 6, - "6": 7, - "9": 8 - }, - "x0:delta:delta": { - "": 0, - "0": 1, - "-1": 2, - "1": 3 - }, - "x1:delta:delta": { - "": 0, - "0": 1, - "1": 2, - "-1": 3 - }, - "top:delta:delta": { - "": 0, - "0": 1, - "1": 2, - "-1": 3 - }, - "bottom:delta:delta": { - "": 0, - "0": 1, - "1": 2, - "-1": 3 - }, - "line:indent": { - "": 0, - "0": 1, - "-2": 2, - "2": 3, - "3": 4, - "-3": 5, - "4": 6, - "-5": 7, - "-4": 8, - "1": 9, - "-6": 10, - "-7": 11, - "7": 12, - "5": 13, - "6": 14, - "-9": 15, - "11": 16, - "-18": 17, - "9": 18, - "-11": 19, - "-17": 20, - "18": 21, - "-1": 22, - "13": 23, - "-15": 24, - "-14": 25, - "-8": 26, - "-10": 27, - "-19": 28, - "-28": 29, - "-22": 30, - "-21": 31, - "-12": 32, - "-16": 33, - "12": 34, - "8": 35, - "-24": 36, - "-20": 37, - "-30": 38 - }, - "line:gap": { - "": 0, - "0": 1, - "1": 2, - "2": 3, - "4": 4, - "3": 5, - "5": 6, - "7": 7, - "8": 8, - "62": 9, - "6": 10, - "44": 11, - "10": 12, - "14": 13, - "18": 14, - "20": 15, - "15": 16, - "11": 17, - "-1": 18, - "42": 19, - "24": 20, - "16": 21, - "31": 22, - "59": 23, - "-26": 24, - "19": 25, - "46": 26, - "63": 27, - "61": 28, - "70": 29, - "22": 30, - "-4": 31, - "34": 32, - "17": 33, - "9": 34, - "67": 35, - "29": 36, - "64": 37, - "75": 38, - "28": 39, - "13": 40, - "45": 41, - "60": 42, - "26": 43, - "51": 44, - "47": 45, - "12": 46, - "-2": 47, - "32": 48, - "43": 49, - "23": 50 - }, - "line:height": { - "": 0, - "1": 1, - "2": 2, - "3": 3 - } - }, - "id2label": [ - "O", - "I-Titre", - "I-Tete", - "I-TOC", - "I-SousSection", - "I-Section", - "I-Pied", - "I-Liste", - "I-Figure", - "I-Chapitre", - "I-Article", - "I-Annexe", - "I-Alinea", - "B-Titre", - "B-Tete", - "B-TOC", - "B-SousSection", - "B-Section", - "B-Pied", - "B-Liste", - "B-Figure", - "B-Chapitre", - "B-Article", - "B-Annexe", - "B-Alinea" - ], - "featdims": { - "lower": 32, - "fontname": 8, - "rgb": 8, - "mctag": 8, - "element": 8, - "x0": 8, - "x1": 8, - "top": 8, - "bottom": 8, - "x0:delta": 8, - "x1:delta": 8, - "top:delta": 8, - "bottom:delta": 8, - "x0:delta:delta": 8, - "x1:delta:delta": 8, - "top:delta:delta": 8, - "bottom:delta:delta": 8, - "line:indent": 8, - "line:gap": 8, - "line:height": 8 - }, - "veclen": 11, - "label_weights": [ - 1.0052219202795956, - 1.001308044287131, - 1.0002590338024633, - 1.0001663201659368, - 1.0027510299042244, - 1.0023337212291115, - 1.0023391802211747, - 1.0000284628630776, - 1.0350841819432672, - 1.0033840915305303, - 1.0003149110350298, - 1.0148145458516258, - 1.0000225603772086, - 1.0076045262294093, - 1.0036968534684156, - 1.0444372889067748, - 1.0198013395064833, - 1.0115605656264237, - 1.0035273332098222, - 1.0006201550188902, - 1.3956124250860895, - 1.0235283388907062, - 1.0014378142744038, - 1.1331484530668263, - 1.0007010164451893 - ], - "hidden_size": 80, - "labels": "literal", - "constrain": false -} \ No newline at end of file diff --git a/alexi/models/rnn_crf.pt b/alexi/models/rnn_crf.pt deleted file mode 100644 index 0484dfb..0000000 Binary files a/alexi/models/rnn_crf.pt and /dev/null differ diff --git a/alexi/segment.py b/alexi/segment.py index ab485fc..95651f3 100644 --- a/alexi/segment.py +++ b/alexi/segment.py @@ -2,28 +2,14 @@ import csv import itertools -import json import operator import re -from collections import Counter from enum import Enum from os import PathLike from pathlib import Path from typing import Any, Callable, Iterable, Iterator, Sequence, Union import joblib # type: ignore -import torch -from allennlp_light.modules.conditional_random_field import ( - ConditionalRandomFieldWeightTrans, -) -from tokenizers import Tokenizer # type: ignore -from torch import nn -from torch.nn.utils.rnn import ( - PackedSequence, - pack_padded_sequence, - pad_packed_sequence, - pad_sequence, -) from alexi.convert import FIELDNAMES from alexi.format import line_breaks @@ -32,7 +18,6 @@ FEATNAMES = [name for name in FIELDNAMES if name not in ("segment", "sequence")] DEFAULT_MODEL = Path(__file__).parent / "models" / "crf.joblib.gz" DEFAULT_MODEL_NOSTRUCT = Path(__file__).parent / "models" / "crf.vl.joblib.gz" -DEFAULT_RNN_MODEL = Path(__file__).parent / "models" / "rnn.pt" FeatureFunc = Callable[[Sequence[T_obj]], Iterator[list[str]]] @@ -291,7 +276,7 @@ def filter_tab(words: Iterable[T_obj]) -> Iterator[T_obj]: def retokenize( - words: Iterable[T_obj], tokenizer: "Tokenizer", drop: bool = False + words: Iterable[T_obj], tokenizer, drop: bool = False ) -> Iterator[T_obj]: """Refaire la tokenisation en alignant les traits et etiquettes.""" for widx, w in enumerate(words): @@ -312,6 +297,18 @@ def retokenize( yield wt +def detokenize(words: Iterable[T_obj]) -> Iterator[T_obj]: + """Defaire la retokenisation""" + widx = -1 + for w in words: + if w["word_id"] != widx: + widx = w["word_id"] + del w["token"] + del w["word_id"] + del w["token_id"] + yield w + + def load(paths: Iterable[PathLike]) -> Iterator[T_obj]: for p in paths: with open(Path(p), "rt") as infh: @@ -321,471 +318,6 @@ def load(paths: Iterable[PathLike]) -> Iterator[T_obj]: yield row -def make_fontname(fontname): - a, plus, b = fontname.partition("+") - if plus: - return b - return fontname - - -BBOX_FEATS = ["x0", "x1", "top", "bottom"] -DELTA_FEATS = [f"{f}:delta" for f in BBOX_FEATS] -DELTA_DELTA_FEATS = [f"{f}:delta" for f in DELTA_FEATS] -LINE_FEATS = ["line:indent", "line:gap", "line:height"] - - -def add_deltas(page): - prev = {} - for w in page: - for f in BBOX_FEATS: - delta = int(w[f]) - prev.setdefault(f, int(w[f])) - w[f"{f}:delta"] = str(round(delta / 10)) - prev[f] = int(w[f]) - prev = {} - for w in page: - for f in DELTA_FEATS: - delta = int(w[f]) - prev.setdefault(f, int(w[f])) - w[f"{f}:delta"] = str(round(delta / 10)) - prev[f] = int(w[f]) - - -def make_rnn_features( - page: Sequence[T_obj], - labels: str = "literal", -) -> tuple[list[T_obj], list[str]]: - crf_features = list( - dict((name, val) for name, _, val in (w.partition("=") for w in feats)) - for feats in layout_features(page) - ) - rnn_features = [] - maxdim = max(float(page[0][x]) for x in ("page_width", "page_height")) - prevnum = None - for f, w in zip(crf_features, page): - elements = w.get("tagstack", "Span").split(";") - text = w["text"] - fontname = make_fontname(w["fontname"]) - feats = { - "lower": text.lower(), - "token": w.get("token", ""), - "fontname": fontname, - "rgb": w.get("rgb", "#000"), - "mctag": w.get("mctag", "P"), - "element": elements[-1], - "first": f["first"], - "last": f["last"], - "uppercase": text.isupper(), - "title": text.istitle(), - "punc": bool(PUNC.match(text)), - "endpunc": bool(ENDPUNC.match(text)), - "multipunc": bool(MULTIPUNC.match(text)), - "numeric": text.isnumeric(), - "bold": ("bold" in fontname.lower()), - "italic": ("italic" in fontname.lower()), - } - bullets = {} - for pattern in Bullet: - m = pattern.value.match(text) - # By definition a bullet comes first in the line - if m and int(f["first"]): - bullets[pattern.name] = m.group(1) - feats["bullet"] = len(bullets) > 0 - sequential = 0 - if int(f["first"]): - if "NUMERIC" in bullets: - num = int(bullets["NUMERIC"]) - sequential = int(prevnum is None or num - prevnum == 1) - prevnum = num - elif "LOWER" in bullets: - num = ord(bullets["LOWER"]) - ord("a") - sequential = int(prevnum is None or num - prevnum == 1) - prevnum = num - # print(bool(sequential), text) - feats["sequential"] = sequential - for name in BBOX_FEATS: - val = float(w[name]) / maxdim * 100 - feats[name] = str(round(val)) - for name in LINE_FEATS: - val = float(f[name]) / maxdim * 100 - feats[name] = str(round(val)) - rnn_features.append(feats) - add_deltas(rnn_features) - rnn_labels = list(page2labels(page, labels)) - return rnn_features, rnn_labels - - -FEATNAMES = ( - [ - "lower", - "fontname", - "rgb", - "mctag", - "element", - ] - + BBOX_FEATS - + DELTA_FEATS - + DELTA_DELTA_FEATS - + LINE_FEATS -) - -# Note that these are all binary -VECNAMES = [ - "first", - "last", - "sequential", - "uppercase", - "punc", - "endpunc", - "multipunc", - "numeric", - "bold", - "italic", - "bullet", -] - - -def make_page_feats(feat2id, page, featdims): - return [ - ( - [feat2id[name].get(feats[name], 0) for name in featdims], - [float(feats[name]) for name in VECNAMES], - ) - for feats in page - ] - - -def make_page_labels(label2id, page): - return [label2id.get(tag, 0) for tag in page] - - -def make_rnn_data( - csvs: Iterable[Path], - word_dim: int = 32, - feat_dim: int = 8, - labels: str = "literal", - tokenizer: Union["Tokenizer", None] = None, - min_count: int = 5, -): - """Creer le jeu de donnees pour entrainer un modele RNN.""" - words = filter_tab(load(csvs)) - if tokenizer is not None: - words = retokenize(words, tokenizer, drop=True) - pages = split_pages(words) - X, y = zip(*(make_rnn_features(p, labels=labels) for p in pages)) - label_counts = Counter(itertools.chain.from_iterable(y)) - id2label = sorted(label_counts.keys(), reverse=True) - label2id = dict((label, idx) for (idx, label) in enumerate(id2label)) - feat2count: dict[str, Counter] = {name: Counter() for name in FEATNAMES} - if tokenizer is not None: - # FIXME: should use all tokens - feat2count["token"] = Counter() - for feats in itertools.chain.from_iterable(X): - for name, val in feats.items(): - if name in feat2count: - feat2count[name][val] += 1 - if tokenizer is not None: - del feat2count["lower"] - feat2id = {} - for name, counts in feat2count.items(): - ids = feat2id[name] = {"": 0} - for val, count in counts.most_common(): - if count < min_count: - break - if val not in ids: - ids[val] = len(ids) - # Eliminate features with only one embedding - if len(ids) == 1: - del feat2id[name] - # FIXME: Should go in train_rnn - featdims = dict( - (name, word_dim) if name == "lower" else (name, feat_dim) for name in feat2id - ) - all_data = [ - ( - make_page_feats(feat2id, page, featdims), - make_page_labels(label2id, tags), - ) - for page, tags in zip(X, y) - ] - - return all_data, featdims, feat2id, label_counts, id2label - - -def load_rnn_data( - iobs: Iterable[T_obj], - feat2id, - id2label, - featdims, - labels: str = "literal", -): - """Creer le jeu de donnees pour tester un modele RNN.""" - label2id = dict((label, idx) for (idx, label) in enumerate(id2label)) - pages = (make_rnn_features(p, labels=labels) for p in split_pages(iobs)) - all_data = [ - ( - make_page_feats(feat2id, page, featdims), - make_page_labels(label2id, tags), - ) - for page, tags in pages - ] - return all_data - - -def batch_sort_key(example): - features, labels = example - return -len(labels) - - -def pad_collate_fn(batch): - batch.sort(key=batch_sort_key) - # Don't use a list comprehension here so we can better understand - sequences_features = [] - sequences_vectors = [] - sequences_labels = [] - lengths = [] - for example in batch: - features, labels = example - feats, vector = zip(*features) - assert len(labels) == len(feats) - assert len(labels) == len(vector) - sequences_features.append(torch.LongTensor(feats)) - # sequences_vectors.append(torch.FloatTensor(np.array(vector) / vecmax)) - sequences_vectors.append(torch.FloatTensor(vector)) - sequences_labels.append(torch.LongTensor(labels)) - lengths.append(len(labels)) - lengths = torch.LongTensor(lengths) - padded_sequences_features = pad_sequence( - sequences_features, batch_first=True, padding_value=0 - ) - pack_padded_sequences_features = pack_padded_sequence( - padded_sequences_features, lengths.cpu(), batch_first=True - ) - padded_sequences_vectors = pad_sequence( - sequences_vectors, batch_first=True, padding_value=0 - ) - pack_padded_sequences_vectors = pack_padded_sequence( - padded_sequences_vectors, lengths.cpu(), batch_first=True - ) - padded_sequences_labels = pad_sequence( - sequences_labels, batch_first=True, padding_value=-100 - ) - mask = torch.ne(padded_sequences_labels, -100) - return ( - (pack_padded_sequences_features, pack_padded_sequences_vectors, mask), - padded_sequences_labels, - ) - - -def pad_collate_fn_predict(batch): - # Require data to be externally sorted by length for prediction - # (otherwise we have no idea which output corresponds to which input! WTF Poutyne!) - # Don't use a list comprehension here so we can better understand - sequences_features = [] - sequences_vectors = [] - sequences_labels = [] - lengths = [] - for example in batch: - features, labels = example - feats, vector = zip(*features) - assert len(labels) == len(feats) - assert len(labels) == len(vector) - sequences_features.append(torch.LongTensor(feats)) - # sequences_vectors.append(torch.FloatTensor(np.array(vector) / vecmax)) - sequences_vectors.append(torch.FloatTensor(vector)) - sequences_labels.append(torch.LongTensor(labels)) - lengths.append(len(labels)) - max_len = max(lengths) - len_lens = len(lengths) - lengths = torch.LongTensor(lengths).cpu() - # ought to be built into torch... - # https://stackoverflow.com/questions/53403306/how-to-batch-convert-sentence-lengths-to-masks-in-pytorch - mask = torch.arange(max_len).expand(len_lens, max_len) < lengths.unsqueeze(1) - padded_sequences_features = pad_sequence( - sequences_features, batch_first=True, padding_value=0 - ) - pack_padded_sequences_features = pack_padded_sequence( - padded_sequences_features, lengths.cpu(), batch_first=True - ) - padded_sequences_vectors = pad_sequence( - sequences_vectors, batch_first=True, padding_value=0 - ) - pack_padded_sequences_vectors = pack_padded_sequence( - padded_sequences_vectors, lengths.cpu(), batch_first=True - ) - return (pack_padded_sequences_features, pack_padded_sequences_vectors, mask) - - -class RNN(nn.Module): - def __init__( - self, - featdims, - feat2id, - veclen, - id2label, - label_weights=None, - hidden_size=64, - num_layer=1, - bidirectional=True, - **_kwargs, - ): - super().__init__() - self.hidden_state = None - self.embedding_layers = {} - self.featdims = featdims - for name in featdims: - self.embedding_layers[name] = nn.Embedding( - len(feat2id[name]), - featdims[name], - padding_idx=0, - ) - self.add_module(f"embedding_{name}", self.embedding_layers[name]) - dimension = sum(featdims.values()) + veclen - self.lstm_layer = nn.LSTM( - input_size=dimension, - hidden_size=hidden_size, - num_layers=num_layer, - bidirectional=bidirectional, - batch_first=True, - ) - self.output_layer = nn.Linear( - hidden_size * (2 if bidirectional else 1), len(id2label) - ) - - def forward( - self, - features: PackedSequence | torch.Tensor, - vectors: PackedSequence | torch.Tensor, - _mask: torch.Tensor, - ): - inputs: PackedSequence | torch.Tensor - # https://discuss.pytorch.org/t/how-to-use-pack-sequence-if-we-are-going-to-use-word-embedding-and-bilstm/28184 - if isinstance(features, PackedSequence): - # for idx, name in enumerate(self.featdims): - # print( - # "WTF", - # name, - # features.data[:, idx].min(), - # features.data[:, idx].max(), - # features.data[:, idx], - # ) - # _ = self.embedding_layers[name](features.data[:, idx]) - stack = [ - self.embedding_layers[name](features.data[:, idx]) - for idx, name in enumerate(self.featdims) - ] - stack.append(vectors.data) - inputs = torch.hstack(stack) - inputs = torch.nn.utils.rnn.PackedSequence(inputs, features.batch_sizes) - else: - assert len(features.shape) == 2 # FIXME: support batches - stack = [ - self.embedding_layers[name](features[:, idx]) - for idx, name in enumerate(self.featdims) - ] - stack.append(vectors) - inputs = torch.hstack(stack) - lstm_out, self.hidden_state = self.lstm_layer(inputs) - if isinstance(lstm_out, PackedSequence): - lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True) - # Make it a "batch" on output - if len(lstm_out.shape) == 2: - lstm_out = lstm_out.unsqueeze(0) - tag_space = self.output_layer(lstm_out) - tag_space = tag_space.transpose( - -1, 1 - ) # We need to transpose since it's a sequence (but why?!) - return tag_space - - -def bio_transitions(id2label): - """Constrain transitions (this is not actually useful)""" - labels_with_boundaries = list(id2label) - labels_with_boundaries.extend(("START", "END")) - - allowed = [] - for from_label_index, from_label in enumerate(labels_with_boundaries): - if from_label in ("START", "END"): - from_tag = from_label - from_entity = "" - else: - from_tag = from_label[0] - from_entity = from_label[1:] - for to_label_index, to_label in enumerate(labels_with_boundaries): - if to_label in ("START", "END"): - to_tag = to_label - to_entity = "" - else: - to_tag = to_label[0] - to_entity = to_label[1:] - if from_tag == "START": - if to_tag in ("O", "B"): - allowed.append((from_label_index, to_label_index)) - elif to_tag == "END": - if from_tag in ("O", "B", "I"): - allowed.append((from_label_index, to_label_index)) - elif any( - ( - # Can always transition to O or B-x - to_tag in ("O", "B"), - # Can only transition to I-x from B-x or I-x or from I to I - to_tag == "I" - and from_tag in ("B", "I") - and from_entity == to_entity, - # Can transition to I from B-x - to_tag == "I" and from_tag == "B" and to_entity == "", - ) - ): - allowed.append((from_label_index, to_label_index)) - return allowed - - -class RNNCRF(RNN): - def __init__( - self, - featdims, - feat2id, - veclen, - id2label, - label_weights=None, - hidden_size=64, - num_layer=1, - bidirectional=True, - constrain=False, - **_kwargs, - ): - super().__init__( - featdims, - feat2id, - veclen, - id2label, - label_weights, - hidden_size, - num_layer, - bidirectional, - ) - self.crf_layer = ConditionalRandomFieldWeightTrans( - num_tags=len(id2label), - label_weights=label_weights, - constraints=bio_transitions(id2label) if constrain else None, - ) - - def forward( - self, - features: PackedSequence | torch.Tensor, - vectors: PackedSequence | torch.Tensor, - mask: torch.Tensor, - ): - tag_space = super().forward(features, vectors, mask) - logits = tag_space.transpose(-1, 1) # We need to transpose it back because wtf - # Make it a "batch" or CRF gets quite irate - if len(logits.shape) == 2: - logits = logits.unsqueeze(0) - if len(mask.shape) == 1: - mask = mask.unsqueeze(0) - paths = self.crf_layer.viterbi_tags(logits, mask) - labels, _scores = zip(*paths) - return logits, labels, mask - - class Segmenteur: def __init__(self, model=DEFAULT_MODEL): self.crf, self.n, self.features, self.labels = joblib.load(model) @@ -801,42 +333,3 @@ def __call__(self, words: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]: for label, word in zip(pred, c2): word["segment"] = label yield word - - -class RNNSegmenteur(Segmenteur): - model: RNN - - def __init__(self, model: PathLike = DEFAULT_RNN_MODEL, device="cpu"): - model = Path(model) - self.device = torch.device(device) - with open(model.with_suffix(".json"), "rt") as infh: - self.config = json.load(infh) - if "crf" in model.name: - self.model = RNNCRF(**self.config) - else: - self.model = RNN(**self.config) - self.model.load_state_dict(torch.load(model, map_location=torch.device("cpu"))) - self.model.eval() - self.model.to(device) - - def __call__(self, words: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]: - for p in split_pages(words): - page, _labels = make_rnn_features(p) - features = make_page_feats( - self.config["feat2id"], page, self.model.featdims - ) - feats, vector = zip(*features) - batch = ( - torch.LongTensor(feats, device=self.device), - torch.FloatTensor(vector, device=self.device), - torch.ones(len(feats), device=self.device), - ) - out = self.model(*batch) - if isinstance(out, tuple): # is a crf - _, (labelgen,), _ = out - else: - # Encore WTF - labelgen = out.transpose(1, -1).argmax(-1)[0].cpu() - for label_id, word in zip(labelgen, p): - word["segment"] = self.config["id2label"][label_id] - yield word diff --git a/pyproject.toml b/pyproject.toml index b9d277c..592bd53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,11 +18,8 @@ dependencies = [ "pdfplumber", "scikit-learn", "sklearn-crfsuite", - "tokenizers", "lunr[languages]", "unidecode", - "torch", - "allennlp-light", # it is not very light ] [project.optional-dependencies] dev = [ @@ -34,7 +31,6 @@ dev = [ "coverage", "pytest", "pytest-cov", - "poutyne", ] [project.scripts] @@ -70,10 +66,6 @@ format = [ "isort alexi test", ] train = [ -# """python scripts/train_rnn.py --nepoch 45 -o alexi/models/rnn.pt \\ -# data/*.csv data/patches/*.csv""", -# """python scripts/train_rnn_crf.py --nepoch 1 --freeze -i alexi/models/rnn.pt \\ -# -o alexi/models/rnn_crf.pt data/*.csv data/patches/*.csv""", """python scripts/train_crf.py \\ --features text+layout+structure --labels bonly \\ --outfile alexi/models/crf.joblib.gz \\ diff --git a/test/test_segment.py b/test/test_segment.py index e4d2440..a5356db 100644 --- a/test/test_segment.py +++ b/test/test_segment.py @@ -59,5 +59,5 @@ def test_retokenize(): tokenizer = MockTokenizer() retokenized = retokenize(iobs, tokenizer) assert iobs != retokenized - detokenized = list(detokenize(retokenized, tokenizer)) + detokenized = list(detokenize(retokenized)) assert iobs == detokenized