From 797fa668a4ddecb239411d5cd884c9fe8736fa79 Mon Sep 17 00:00:00 2001
From: Emerson Rocha <rocha@ieee.org>
Date: Mon, 22 Nov 2021 13:48:36 -0300
Subject: [PATCH] linguacodex.py (#4): bcp47_langtag() already works for
 typical langtag

---
 scripts/fn/linguacodex.py | 65 +++++++++++++++++++++++++++++++++++----
 1 file changed, 59 insertions(+), 6 deletions(-)

diff --git a/scripts/fn/linguacodex.py b/scripts/fn/linguacodex.py
index ca5ec5d..b393813 100755
--- a/scripts/fn/linguacodex.py
+++ b/scripts/fn/linguacodex.py
@@ -468,7 +468,7 @@ def in_jq(rem, quod: str = '.', incognitum: Any = '?!?'):
     return neo_rem
 
 
-def bcp47_langtag(rem: str, item: str = None) -> dict:
+def bcp47_langtag(rem: str, item: str = None, strict: bool = True) -> dict:
     """Public domain python function to process BCP47 langtag
 
     Created at 2021-11-22. Partial implementation of BCP47 (RFC 5646).
@@ -476,6 +476,8 @@ def bcp47_langtag(rem: str, item: str = None) -> dict:
 
     Args:
         rem (str): The BCP47 langtag
+        item (str): Specific value to return instead of full information
+        strict (bool): Raise on invalid parts; if False, set them to False
 
     Returns:
         dict: Python dictionary. None means not found. False means the feature
@@ -567,6 +569,14 @@ def bcp47_langtag(rem: str, item: str = None) -> dict:
 
         >>> bcp47_langtag('pt-Latn-BR', 'language')
         'pt'
+        >>> bcp47_langtag('pt-Latn-BR', 'script')
+        'Latn'
+        >>> bcp47_langtag('pt-Latn-BR', 'region')
+        'BR'
+        >>> bcp47_langtag('es-419', 'region')
+        '419'
+        >>> bcp47_langtag('zh-Latn-CN-variant1-a-extend1-x-wadegile-private1', 'region')
+        'CN'
     """
     result = {
         'langtag': rem,
@@ -581,13 +591,56 @@ def bcp47_langtag(rem: str, item: str = None) -> dict:
 
     parts = rem.replace('_', '-').strip().split('-')
 
-    # for part in parts:
-    #     if result['language'] is None:
-            
-    result['language'] = parts[0]
+    for part in parts:
+        if result['language'] is None:
+            if part.isalnum() and len(part) == 2 or len(part) == 3:
+                result['language'] = part.lower()
+            else:
+                if not strict:
+                    result['language'] = False
+                else:
+                    raise ValueError(rem + 'language?')
+            continue
+
+        if len(part) == 4:
+            if part.isalpha() and result['script'] is None:
+                if result['region'] is None and len(result['privateuse']) == 0:
+                    result['script'] = part.capitalize()
+                else:
+                    if not strict:
+                        result['script'] = False
+                    else:
+                        raise ValueError(
+                            rem + 'script after region/privateuse')
+            else:
+                if not strict:
+                    result['script'] = False
+                else:
+                    raise ValueError(rem + 'script?')
+            continue
+
+        if len(part) == 2:
+            if part.isalpha() and result['region'] is None:
+                result['region'] = part.upper()
+            else:
+                if not strict:
+                    result['region'] = False
+                else:
+                    raise ValueError(rem + 'region?')
+
+        if len(part) == 3:
+            if part.isnumeric() and result['region'] is None:
+                result['region'] = part
+            else:
+                if not strict:
+                    result['region'] = False
+                else:
+                    raise ValueError(rem + 'region?')
+            #pass
+
+    # result['language'] = parts[0]
 
     # Stritly speaking, we shoudl check if is alpha2 or alpha3
-    
 
     if item != None:
         return result[item]