From 9bef4045fc8299bb74daa40b8b88e09b9929de10 Mon Sep 17 00:00:00 2001
From: Ryan Peach <rgpeach10@gmail.com>
Date: Tue, 17 Dec 2024 01:18:44 -0500
Subject: [PATCH 1/3] Changes from
 https://github.com/ryanpeach/mdlinker/pull/54

---
 DEBUGGING.md                 |  7 +++++
 Justfile                     | 10 +++++++
 bin/byte_index               | 52 ++++++++++++++++++++++++++++++++++++
 src/file/content/wikilink.rs | 36 ++++++++++++-------------
 src/rules/unlinked_text.rs   | 30 ++++++++++-----------
 5 files changed, 102 insertions(+), 33 deletions(-)
 create mode 100644 DEBUGGING.md
 create mode 100755 bin/byte_index

diff --git a/DEBUGGING.md b/DEBUGGING.md
new file mode 100644
index 0000000..a7b1c34
--- /dev/null
+++ b/DEBUGGING.md
@@ -0,0 +1,7 @@
+# rust-lldb
+
+https://dev.to/bmatcuk/debugging-rust-with-rust-lldb-j1f
+
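+Run `just test-debug <test_name> <breakpoint>` (macOS only) to start the logseq test binary under rust-lldb.
+
+The recipe sets the breakpoint and then issues `r <test_name>` for you; to run another test in the same session, use `r <test_name>` again. https://users.rust-lang.org/t/running-a-single-test-under-a-debugger/44460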
diff --git a/Justfile b/Justfile
index c282b22..7601a70 100644
--- a/Justfile
+++ b/Justfile
@@ -4,3 +4,13 @@ test:
 
 test-print test_name:
     RUNNING_TESTS=true RUST_LOG=trace RUST_BACKTRACE=1 cargo test -- --test-threads=1 {{test_name}}
+
+[macos]
+test-debug test_name breakpoint:
+    #!/bin/bash
+    TEST_OUTPUT=$(RUNNING_TESTS=true cargo test --no-run 2>&1 >/dev/null)  # keep stderr only; stdout is discarded
+    DEP1=$(echo "$TEST_OUTPUT" | grep -ohe 'Executable tests/logseq/main.rs (target/debug/deps/logseq-[a-z0-9]*' | awk -F'[()]' '{print $2}')
+    echo "${DEP1:?could not find the logseq test executable in the cargo output}"  # abort instead of launching lldb with no target
+    RUNNING_TESTS=true RUST_LOG=debug RUST_BACKTRACE=full rust-lldb "$DEP1" \
+        -o "b {{breakpoint}}" \
+        -o "r {{test_name}}"
diff --git a/bin/byte_index b/bin/byte_index
new file mode 100755
index 0000000..607b4ed
--- /dev/null
+++ b/bin/byte_index
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import argparse
+
+def find_byte_indexes(text, search_term):
+    """Return every UTF-8 byte offset at which search_term occurs in text.
+
+    Both arguments are str; they are encoded to UTF-8 before searching, so the
+    returned offsets count bytes rather than characters. Overlapping
+    occurrences are reported, because each search resumes one byte past the
+    previous hit.
+    """
+    haystack = text.encode('utf-8')
+    needle = search_term.encode('utf-8')
+    offsets = []
+    position = haystack.find(needle)
+    while position >= 0:
+        offsets.append(position)
+        position = haystack.find(needle, position + 1)
+    return offsets
+
+def main():
+    """Parse CLI arguments and print the byte offsets of a term within a file."""
+    parser = argparse.ArgumentParser(description="Find byte indexes of a search term in a file.")
+    parser.add_argument("file", help="Path to the file to be searched")
+    parser.add_argument("search_term", help="The term to search for in the file")
+
+    args = parser.parse_args()
+
+    # Read the file. newline='' disables universal-newline translation so that
+    # "\r\n" is not collapsed to "\n", which would shift every later byte offset.
+    try:
+        with open(args.file, 'r', encoding='utf-8', newline='') as f:
+            file_content = f.read()
+    except FileNotFoundError:
+        print(f"Error: File '{args.file}' not found.")
+        return
+    except Exception as e:
+        print(f"Error reading file: {e}")
+        return
+
+    # Find byte indexes
+    indexes = find_byte_indexes(file_content, args.search_term)
+
+    # Print the results
+    if indexes:
+        print(f"Found '{args.search_term}' at byte indexes: {indexes}")
+    else:
+        print(f"'{args.search_term}' not found in the file.")
+
+if __name__ == "__main__":
+    main()
diff --git a/src/file/content/wikilink.rs b/src/file/content/wikilink.rs
index 6199186..78cdf6e 100644
--- a/src/file/content/wikilink.rs
+++ b/src/file/content/wikilink.rs
@@ -34,7 +34,7 @@ impl Alias {
         self.0.is_empty()
     }
     #[must_use]
-    pub fn len(&self) -> usize {
+    pub fn char_len(&self) -> usize {
         self.0.chars().count()
     }
 }
@@ -104,26 +104,26 @@ impl Visitor for WikilinkVisitor {
                         .expect("Otherwise the regex wouldn't match")
                         .as_str(),
                 );
+                let capture_start_byte = captures
+                    .get(1)
+                    .expect("The regex has 2 capture groups")
+                    .start();
+                let text_without_frontmatter = remove_frontmatter_from_source(source, node);
+                let sourcepos_start_offset_bytes = SourceOffset::from_location(
+                    text_without_frontmatter,
+                    sourcepos.start.line,
+                    sourcepos.start.column,
+                )
+                .offset();
+                let span = SourceSpan::new(
+                    (sourcepos_start_offset_bytes + capture_start_byte).into(),
+                    alias.char_len(),
+                );
+                let span_repaired = repair_span_due_to_frontmatter(span, node);
                 self.wikilinks.push(
                     Wikilink::builder()
                         .alias(alias.clone())
-                        .span(repair_span_due_to_frontmatter(
-                            SourceSpan::new(
-                                (SourceOffset::from_location(
-                                    remove_frontmatter_from_source(source, node),
-                                    sourcepos.start.line,
-                                    sourcepos.start.column,
-                                )
-                                .offset()
-                                    + captures
-                                        .get(1)
-                                        .expect("The regex has 2 capture groups")
-                                        .start())
-                                .into(),
-                                alias.len(),
-                            ),
-                            node,
-                        ))
+                        .span(span_repaired)
                         .build(),
                 );
             }
diff --git a/src/rules/unlinked_text.rs b/src/rules/unlinked_text.rs
index 17552ee..f10b9f4 100644
--- a/src/rules/unlinked_text.rs
+++ b/src/rules/unlinked_text.rs
@@ -181,20 +181,20 @@ impl Visitor for UnlinkedTextVisitor {
                     continue;
                 }
                 let alias = Alias::new(&patterns[found.pattern().as_usize()]);
-                let span = repair_span_due_to_frontmatter(
-                    SourceSpan::new(
-                        (SourceOffset::from_location(
-                            remove_frontmatter_from_source(source, node),
-                            sourcepos.start.line,
-                            sourcepos.start.column,
-                        )
-                        .offset()
-                            + found.start())
-                        .into(),
-                        found.end() - found.start(),
-                    ),
-                    node,
-                );
+                if "lorem" == alias.to_string() {
+                    println!("Found lorem");
+                }
+                let text_without_frontmatter = remove_frontmatter_from_source(source, node);
+                let sourcepos_start_offset_bytes = SourceOffset::from_location(
+                    text_without_frontmatter,
+                    sourcepos.start.line,
+                    sourcepos.start.column,
+                )
+                .offset();
+                let byte_length = found.end() - found.start();
+                let offset_bytes = sourcepos_start_offset_bytes + found.start();
+                let span = SourceSpan::new(offset_bytes.into(), byte_length);
+                let span_repaired = repair_span_due_to_frontmatter(span, node);
 
                 // Dont match inside wikilinks
                 if let Some(parent) = parent {
@@ -204,7 +204,7 @@ impl Visitor for UnlinkedTextVisitor {
                     }
                 }
 
-                self.new_unlinked_texts.push((alias, span));
+                self.new_unlinked_texts.push((alias, span_repaired));
             }
         }
         Ok(())

From 134ac04c8b180be267fd51cb6494e5f4ce51c0a9 Mon Sep 17 00:00:00 2001
From: Ryan Peach <rgpeach10@gmail.com>
Date: Tue, 17 Dec 2024 01:23:37 -0500
Subject: [PATCH 2/3] Now rejecting files with multibyte characters

---
 src/visitor.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/visitor.rs b/src/visitor.rs
index 0cb6ba5..5683aa5 100644
--- a/src/visitor.rs
+++ b/src/visitor.rs
@@ -93,6 +93,11 @@ pub enum ParseError {
         #[backtrace]
         source: std::io::Error,
     },
+    #[error("Multibyte characters found in the file {file:?}")]
+    MultibyteError {
+        file: PathBuf,
+        backtrace: backtrace::Backtrace,
+    },
     #[error("Error parsing the source code for file {file:?} using tree-sitter")]
     TreeSitter {
         file: PathBuf,
@@ -119,6 +124,15 @@ pub fn parse(path: &PathBuf, visitors: Vec<Rc<RefCell<dyn Visitor>>>) -> Result<
         file: path.clone(),
         source,
     })?;
+
+    // Check for multibyte characters
+    if source.chars().count() != source.len() {
+        return Err(ParseError::MultibyteError {
+            file: path.clone(),
+            backtrace: backtrace::Backtrace::force_capture(),
+        });
+    }
+
     // Parse the source code
     let arena = Arena::new();
     let options = ExtensionOptionsBuilder::default()

From c0b8030caafb596046384de0f6ba7247aa232f9f Mon Sep 17 00:00:00 2001
From: Ryan Peach <rgpeach10@gmail.com>
Date: Tue, 17 Dec 2024 01:27:49 -0500
Subject: [PATCH 3/3] Got rid of multibyte characters in test assets

---
 tests/logseq/unlinked_text/assets/journals/2024_08_10.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/logseq/unlinked_text/assets/journals/2024_08_10.md b/tests/logseq/unlinked_text/assets/journals/2024_08_10.md
index b1f3891..058ed81 100644
--- a/tests/logseq/unlinked_text/assets/journals/2024_08_10.md
+++ b/tests/logseq/unlinked_text/assets/journals/2024_08_10.md
@@ -1,6 +1,6 @@
 - [[Tvtinl]]
     - Axndco/Pbyudhm
-        - Zqxzn Avdxyxofu’m
+        - Zqxzn Avdxyxofu'm
     - Vgrjsat
     - Foueh eypvsd?
     - Xotw wlhcgrryrj
@@ -8,8 +8,8 @@
     - Siuh lnmcnlv
     - Jztvzm 21 pdyhwjzvnx
 - Hfxeno aq tmnz d lecc ymdeuz
-    - Lh cy b brkbkgmtz frg ksocv gomu yh mvr’fw ysnt doeun bzim, eka aclmy vrpa hys oz tuk mpb nlle lyn icazyvey ik my peub xr
+    - Lh cy b brkbkgmtz frg ksocv gomu yh mvr'fw ysnt doeun bzim, eka aclmy vrpa hys oz tuk mpb nlle lyn icazyvey ik my peub xr
 - [[Mek Xvmr]]
-    - “Usp tcr itlo cb zsez hjmrou ler f pdv”
+    - "Usp tcr itlo cb zsez hjmrou ler f pdv"
     - Pav bvqv egx ic jxa rto dd amr nwd. Hwt sq vabt kyzk hhx yp, tb j qyt cebfd:
-    - Jjyyvwrro xkbj acor uzifhktovnah hfbyv, udh hluq hvv jx uoua’o whnp ik wxagb knyiyds
+    - Jjyyvwrro xkbj acor uzifhktovnah hfbyv, udh hluq hvv jx uoua'o whnp ik wxagb knyiyds