Merge pull request #21 from dahosek/develop
1.3.0: Add support for Unicode 16.0.0, updates to license info and doc…
dahosek authored Oct 29, 2024
2 parents d944a3f + 0431221 commit 09feefa
Showing 10 changed files with 8,733 additions and 3,108 deletions.
11 changes: 6 additions & 5 deletions Cargo.toml
@@ -1,22 +1,23 @@
[package]
name = "finl_unicode"
version = "1.2.0"
version = "1.3.0"
edition = "2021"
license = "MIT OR Apache-2.0"
license = "(MIT OR Apache-2.0) AND Unicode-DFS-2016"
keywords = ["unicode", "segmentation", "graphemes"]
categories = ["text-processing", "internationalization"]
description = "Library for handling Unicode functionality for finl (categories and grapheme segmentation)"
homepage = "https://finl.xyz"
repository = "https://github.com/dahosek/finl_unicode"
exclude = ["resources"]

[dependencies]

[dev-dependencies]
criterion = { version = "0.3.5", features=["html_reports"]}
criterion = { version = "0.5.1", features=["html_reports"]}
unicode_categories = "0.1.1"
finl_unicode = {path=".", features=["grapheme_clusters", "categories"]}
unicode-segmentation = "1.9.0"
bstr = "1.0.0"
unicode-segmentation = "1.12.0"
bstr = "1.10.0"

[features]
default = ["categories", "grapheme_clusters"]
39 changes: 39 additions & 0 deletions LICENSE-UNICODE
@@ -0,0 +1,39 @@
UNICODE LICENSE V3

COPYRIGHT AND PERMISSION NOTICE

Copyright © 1991-2023 Unicode, Inc.

NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.

Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
53 changes: 25 additions & 28 deletions README.md
@@ -26,24 +26,24 @@ I did benchmarks comparing my code against existing crates and discovered that I
All benchmarks are generated using Criterion. You can replicate them by running `cargo bench` from the project directory. Three numbers are given for all results: low/mean/high, all from the output of Criterion. The mean value is given in **bold**.
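
To see what such a measurement looks like in practice, here is a minimal Criterion harness sketch. The text path and the use of `char::is_alphabetic` as the counting predicate are placeholder assumptions for illustration; the actual benchmarks in this repository use the category lookups of each crate under test.

```rust
use criterion::{criterion_group, criterion_main, Criterion};

// Sketch of a letter-counting benchmark. The input path and the
// `is_alphabetic` predicate are placeholders; the real benchmarks call the
// category lookups provided by each crate being compared.
fn count_letters(c: &mut Criterion) {
    let text = std::fs::read_to_string("resources/japanese.txt")
        .expect("benchmark text should be present");
    c.bench_function("count letters (Japanese text)", |b| {
        b.iter(|| text.chars().filter(|ch| ch.is_alphabetic()).count())
    });
}

criterion_group!(benches, count_letters);
criterion_main!(benches);
```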

#### Unicode categories
I ran three benchmarks to compare the performance of the crates.
I ran three benchmarks to compare the performance of the crates on my M3 Max MacBook Pro.
The Japanese text benchmark reads the Project Gutenberg EBook of *Kumogata monsho* by John Falkner and counts the characters in it which are Unicode letters.
The Czech text benchmark reads the Project Gutenberg EBook of *Cítanka pro skoly obecné* by Jan Stastný, Jan Lepar, and Josef Sokol (this was to test against a Latin-alphabet text with lots of diacriticals).
All letters and lowercase letters are counted.
All letters are counted in the first benchmark and lowercase letters only are counted in the second.
The English text benchmark reads the Project Gutenberg eBook of *Frankenstein* by Mary Wollstonecraft Shelley (to run against a text which is pure ASCII).
All letters and lowercase letters are counted. The source code check is from neovim. Again, letters and lowercase letters are counted in the sample.
All letters and lowercase letters are counted in two benchmarks, as with the Czech text. The source code benchmark uses a sample from neovim; again, letters and lowercase letters are counted.

I compared against [unicode_categories](https://docs.rs/unicode_categories/latest/unicode_categories/) 0.1.1. All times are in ms. Smaller is better.

| Benchmark | `finl_unicode` | `unicode_categories` |
|--------------------------|-----------------------------|--------------------------|
| Japanese text | 0.62484/**0.64200**/0.66311 | 15.382/**15.719**/16.092 |
| Czech text | 0.18248/**0.19137**/0.19975 | 3.2322/**3.3329**/3.4435 |
| Czech text (lowercase) | 0.20361/**0.20529**/0.20724 | 1.8496/**1.8742**/1.9026 |
| English text | 0.52260/**0.54461**/0.56682 | 13.038/**13.330**/13.655 |
| English text (lowercase) | 0.72885/**0.74219**/0.75668 | 8.3998/**8.5037**/8.6233 |
| Source code | 0.05544/**0.05785**/0.06046 | 1.6512/**1.7063**/1.7656 |
| Source code (lowercase) | 0.07506/**0.07673**/0.07895 | 0.7285/**0.7536**/0.7821 |
| Japanese text | 0.26318/**0.26356**/0.26397 | 11.055/**11.071**/11.088 |
| Czech text | 0.07618/**0.07631**/0.07645 | 2.6268/**2.6293**/2.6316 |
| Czech text (lowercase) | 0.07601/**0.07614**/0.07626 | 1.4984/**1.4999**/1.5014 |
| English text | 0.24668/**0.24693**/0.24723 | 11.173/**11.185**/11.195 |
| English text (lowercase) | 0.24682/**0.24707**/0.24735 | 7.8968/**7.9050**/7.9127 |
| Source code | 0.02738/**0.02745**/0.02753 | 1.5738/**1.5760**/1.5787 |
| Source code (lowercase) | 0.02733/**0.02735**/0.02738 | 0.7285/**0.7536**/0.7821 |

As you can see, this is a clear win. The difference is the choice of algorithm: `finl_unicode` uses a two-step table lookup to store categories compactly, while `unicode_categories` uses a combination of range checks and binary searches on tables.
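
As a rough illustration of the two-step lookup (not the crate's actual generated tables — the page size and dummy data below are assumptions for the sketch), the idea is to index a small first-level table by the high bits of the code point and then index the selected 256-entry page by the low byte:

```rust
// Two-level lookup sketch: the first table maps `code point >> 8` to a page
// index; the page stores one byte of category data per code point in that
// 256-code-point block. Identical pages are shared, which keeps the tables
// small. Table contents here are dummies; the real tables are generated from
// the Unicode data files.
const PAGE_TABLE: [u8; 0x1100] = [0; 0x1100]; // high bits -> page index
const PAGES: [[u8; 256]; 1] = [[0; 256]];     // page index -> per-code-point data

fn category_byte(c: char) -> u8 {
    let cp = c as usize;                 // always <= 0x10FFFF for a char
    let page = PAGE_TABLE[cp >> 8] as usize;
    PAGES[page][cp & 0xFF]
}

fn main() {
    // With the dummy tables above, every character maps to 0.
    assert_eq!(category_byte('a'), 0);
}
```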

@@ -55,24 +55,20 @@ texts that were part of the `unicode_segmentation` benchmark suite.

All times are in µs, smaller is better.

| Benchmark | `finl_unicode` | `unicode_segmentation` | `bstr` |
|------------------|--------------------------|--------------------------|--------------------------|
| Unicode graphemes | 130.34/**133.31**/137.00 | 209.51/**217.50**/225.53 | 337.68/**354.59**/372.75 |
| Arabic text | 262.05/**268.78**/273.65 | 443.11/**463.19**/482.25 | 842.78/**872.47**/906.84 |
| English text | 387.88/**395.08**/404.00 | 527.29/**552.92**/586.04 | 424.73/**437.04**/449.23 |
| Hindi text | 204.88/**216.04**/228.14 | 489.75/**500.55**/512.20 | 638.01/**641.28**/644.87 |
| Japanese text | 181.65/**190.87**/202.92 | 437.98/**451.51**/467.17 | 855.04/**880.48**/904.88 |
| Korean text | 298.19/**304.42**/312.47 | 813.45/**844.54**/880.53 | 1259.2/**1304.7**/1350.6 |
| Mandarin text | 154.55/**159.33**/164.22 | 284.59/**293.63**/306.59 | 679.67/**704.13**/730.46 |
| Russian text | 300.56/**312.86**/327.44 | 372.59/**392.12**/419.40 | 783.41/**838.96**/896.44 |
| Source code | 424.39/**443.88**/463.77 | 501.16/**506.81**/513.27 | 513.79/**531.82**/551.31 |

Adding some additional tests reveals some interesting contrasts in performance. On text with minimal
clustering (English and source code), my code is faster than `unicode_segmentation` and `bstr` (but not dramatically so) and it's
interesting to see that `bstr` is slightly faster than `unicode_segmentation` on the English text benchmark,
but where grapheme clusters become more common (Arabic and Hindi), the performance is dramatically better
with my crate. I wouldn’t expect clusters in the Japanese, but it and Korean show the most dramatic
differences in performance.
| Benchmark | `finl_unicode` | `unicode_segmentation` | `bstr` |
|-------------------|--------------------------|--------------------------|--------------------------|
| Unicode graphemes | 63.692/**63.813**/63.948 | 323.64/**324.08**/324.47 | 273.24/**273.87**/274.63 |
| Arabic text | 123.67/**124.02**/124.41 | 544.88/**545.97**/547.05 | 1055.7/**1057.8**/1059.8 |
| English text | 164.48/**164.56**/164.65 | 1057.6/**1061.1**/1064.7 | 349.35/**349.79**/350.26 |
| Hindi text | 94.467/**94.665**/94.865 | 604.75/**605.38**/606.01 | 838.03/**840.19**/842.23 |
| Japanese text | 70.491/**70.573**/70.685 | 451.89/**452.88**/453.88 | 997.97/**1000.5**/1003.4 |
| Korean text | 161.34/**161.79**/162.24 | 600.55/**602.49**/604.49 | 1291.9/**1293.5**/1295.1 |
| Mandarin text | 67.667/**67.792**/67.941 | 387.86/**388.61**/389.37 | 919.42/**920.86**/922.38 |
| Russian text | 127.03/**127.30**/127.60 | 609.74/**610.91**/612.12 | 873.43/**877.29**/881.24 |
| Source code | 176.73/**178.05**/180.91 | 1067.4/**1070.8**/1074.4 | 494.43/**495.96**/497.62 |

With the move from benchmarking on Intel to Apple Silicon, the performance gap between my code and the other
libraries generally widened. I’m curious about explanations for why this might happen.
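
For context, the operation being measured — iterating over extended grapheme clusters — looks roughly like this with `unicode_segmentation` (a sketch for illustration, not the benchmark code itself; the other crates expose equivalent grapheme iterators through their own APIs):

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // Three user-perceived characters, each built from multiple code points;
    // grapheme segmentation keeps each cluster together.
    let text = "a\u{310}e\u{301}o\u{308}\u{332}";
    let clusters: Vec<&str> = text.graphemes(true).collect();
    assert_eq!(clusters.len(), 3);
}
```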

## Why not?

@@ -104,6 +100,7 @@ I guarantee no warranty or support, although if you care to throw some money my
- **1.0.2** More changes because the first round apparently weren’t enough
- **1.1.0** Add support for Unicode 15.0.0, added new benchmark comparisons.
- **1.2.0** Allow grapheme clustering to work on any `Peekable` iterator over `char` or `(usize,char)`.
- **1.3.0** Add support for Unicode 16.0.0 (significant changes required for Indic Conjunct clusters), update license documentation and benchmark comparisons.

---

2 changes: 1 addition & 1 deletion generate-sources/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "generate-sources"
version = "1.0.2"
version = "1.3.0"
edition = "2021"
license = "MIT OR Apache-2.0"
description = "Utility program to generate sources from Unicode data"
54 changes: 27 additions & 27 deletions generate-sources/src/main.rs
@@ -9,7 +9,7 @@ use reqwest::blocking::Client;
use itertools::Itertools;

fn main() -> anyhow::Result<()> {
let unicode_version = "15.0.0";
let unicode_version = "16.0.0";
let mut out_dir = env::var_os("CARGO_MANIFEST_DIR").unwrap();
out_dir.push("/target/tmp/");
if !Path::new(&out_dir).try_exists()? {
@@ -25,6 +25,7 @@ fn main() -> anyhow::Result<()> {
let unicode_data_txt = data_dir.join("UnicodeData.txt");
let grapheme_break_test_txt = data_dir.join("GraphemeBreakTest.txt");
let grapheme_break_property_txt = data_dir.join("GraphemeBreakProperty.txt");
let derived_core_properties_txt = data_dir.join("DerivedCoreProperties.txt");
let emoji_data_txt = data_dir.join("emoji-data.txt");


@@ -40,8 +41,10 @@ fn main() -> anyhow::Result<()> {
download_unicode_data(&grapheme_break_property_txt, "ucd/auxiliary/GraphemeBreakProperty.txt", unicode_version)?;
eprintln!("Downloading emoji data...");
download_unicode_data(&emoji_data_txt, "ucd/emoji/emoji-data.txt", unicode_version)?;
eprintln!("Downloading derived core properties...");
download_unicode_data(&derived_core_properties_txt, "ucd/DerivedCoreProperties.txt", unicode_version)?;
eprintln!("Generating grapheme break data...");
build_grapheme_break_property(&code_dir, &grapheme_break_property_txt, &emoji_data_txt)?;
build_grapheme_break_property(&code_dir, &grapheme_break_property_txt, &emoji_data_txt, &derived_core_properties_txt)?;
Ok(())
}
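
// A plausible sketch of the `download_unicode_data` helper called above, inferred
// from its call sites; the real helper in this repository may differ in details
// such as caching and error handling. It fetches the file from the public UCD
// location for the requested version and writes it to the target path.
fn download_unicode_data_sketch(target: &PathBuf, path: &str, unicode_version: &str) -> anyhow::Result<()> {
    if target.try_exists()? {
        return Ok(()); // reuse data downloaded on a previous run
    }
    let url = format!("https://www.unicode.org/Public/{unicode_version}/{path}");
    let body = Client::new().get(&url).send()?.error_for_status()?.text()?;
    std::fs::write(target, body)?;
    Ok(())
}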

@@ -254,13 +257,15 @@ fn str_to_range(range: &str) -> RangeInclusive<usize> {
}
}

fn build_grapheme_break_property(out_dir: &OsString, grapheme_break_property_txt: &PathBuf, emoji_data_txt: &PathBuf) -> anyhow::Result<()> {
fn build_grapheme_break_property(out_dir: &OsString, grapheme_break_property_txt: &PathBuf, emoji_data_txt: &PathBuf, derived_core_properties_txt: &PathBuf) -> anyhow::Result<()> {
let grapheme_property_rs = Path::new(out_dir).join("grapheme_property.rs");
let grapheme_property_rs = File::create(grapheme_property_rs)?;
let grapheme_break_property = File::open(grapheme_break_property_txt)?;
let grapheme_break_property = BufReader::new(grapheme_break_property);
let emoji_data = File::open(emoji_data_txt)?;
let emoji_data = BufReader::new(emoji_data);
let derived_core_properties = File::open(derived_core_properties_txt)?;
let derived_core_properties = BufReader::new(derived_core_properties);

// first pass: build an array of all the properties
let mut raw_grapheme_properties = [0u8;0x110000];
@@ -289,31 +294,26 @@ fn build_grapheme_break_property(out_dir: &OsString, grapheme_break_property_txt
}
}

// update conjunct cluster characteristics
// We set the high nibble to 2x for consonants and 1x for linkers
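// Illustrative (approximate) data lines from DerivedCoreProperties.txt that this
// loop matches; the fields are "<code point or range> ; InCB; <class> # comment":
//   0915..0939    ; InCB; Consonant # Lo  [37] DEVANAGARI LETTER KA..DEVANAGARI LETTER HA
//   094D          ; InCB; Linker    # Mn       DEVANAGARI SIGN VIRAMA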
for line in derived_core_properties.lines() {
let line = line.unwrap();
if let Some((line, _)) = line.split_once('#') {
if let Some((range, property)) = line.split_once(';') {
let range = range.trim();
let property = property.trim();
if property == "InCB; Linker" {
raw_grapheme_properties.get_mut(str_to_range(range)).unwrap().iter_mut().for_each(|x| *x |= 0x10);

}
if property == "InCB; Consonant" {
raw_grapheme_properties.get_mut(str_to_range(range)).unwrap().iter_mut().for_each(|x| *x |= 0x20);
}
}
}
}

write_data_tables(grapheme_property_rs, &raw_grapheme_properties, "GP_TABLE", "GP_PAGES")
// Then we break it down into pages (wrapping the result with a bit of Rust boilerplate)
// writeln!(grapheme_property_rs, "// GENERATED CODE DO NOT MANUALLY EDIT")?;
// writeln!(grapheme_property_rs, "pub const GP_TABLE: [u8;0x1100] = [")?;
// let mut page_index = HashMap::new();
// let mut page_number = 0;
// for page in 0 .. 0x1100 {
// let page_start = page << 8;
// let page_data = raw_grapheme_properties[page_start..page_start+0x100].to_vec();
// let &mut page_ref = page_index.entry(page_data).or_insert(page_number);
// if page_ref == page_number {
// page_number += 1
// }
// writeln!(grapheme_property_rs, "\t {page_ref}, // {page:#x}")?;
// }
// writeln!(grapheme_property_rs, "];")?;
//
// let cat_pages = page_index.iter()
// .map(|(k, v)| (v,k))
// .sorted_by(|(a,_),(b,_)| Ord::cmp(a,b))
// .map(|(_, page)| page )
// .collect_vec();
// writeln!(grapheme_property_rs, "pub const GP_PAGES: [[u8;256];{}] = {cat_pages:#x?};", cat_pages.len())?;
//
// Ok(())
}

fn write_data_tables(mut rust_file : File, raw_data: &[u8], table_name: &str, pages_name: &str) -> anyhow::Result<()> {
