From 9bef4045fc8299bb74daa40b8b88e09b9929de10 Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Tue, 17 Dec 2024 01:18:44 -0500 Subject: [PATCH 1/3] Changes from https://github.com/ryanpeach/mdlinker/pull/54 --- DEBUGGING.md | 7 +++++ Justfile | 10 +++++++ bin/byte_index | 52 ++++++++++++++++++++++++++++++++++++ src/file/content/wikilink.rs | 36 ++++++++++++------------- src/rules/unlinked_text.rs | 30 ++++++++++----------- 5 files changed, 102 insertions(+), 33 deletions(-) create mode 100644 DEBUGGING.md create mode 100755 bin/byte_index diff --git a/DEBUGGING.md b/DEBUGGING.md new file mode 100644 index 0000000..a7b1c34 --- /dev/null +++ b/DEBUGGING.md @@ -0,0 +1,7 @@ +# rust-lldb + +https://dev.to/bmatcuk/debugging-rust-with-rust-lldb-j1f + +Run `just test-debug TEST_NAME BREAKPOINT` to run the tests in debug mode using lldb. + +Now use `r TEST_NAME` to run a specific test. https://users.rust-lang.org/t/running-a-single-test-under-a-debugger/44460 diff --git a/Justfile b/Justfile index c282b22..7601a70 100644 --- a/Justfile +++ b/Justfile @@ -4,3 +4,13 @@ test: test-print test_name: RUNNING_TESTS=true RUST_LOG=trace RUST_BACKTRACE=1 cargo test -- --test-threads=1 {{test_name}} + +[macos] +test-debug test_name breakpoint: + #!/bin/bash + TEST_OUTPUT=$(RUNNING_TESTS=true cargo test --no-run 2>&1 >/dev/null) + DEP1=$(echo $TEST_OUTPUT | grep -ohe 'Executable tests/logseq/main.rs (target/debug/deps/logseq-[a-z0-9]*' | awk -F'[()]' '{print $2}') + echo $DEP1 + RUNNING_TESTS=true RUST_LOG=debug RUST_BACKTRACE=full rust-lldb $DEP1 \ + -o "b {{breakpoint}}" \ + -o "r {{test_name}}" diff --git a/bin/byte_index b/bin/byte_index new file mode 100755 index 0000000..607b4ed --- /dev/null +++ b/bin/byte_index @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +import argparse + +def find_byte_indexes(text, search_term): + # Encode both the text and the search term into bytes + byte_text = text.encode('utf-8') + byte_search_term = search_term.encode('utf-8') + + # Initialize a list to store the byte indexes + 
byte_indexes = [] + + # Start searching for the term in the byte_text + index = byte_text.find(byte_search_term) + while index != -1: + byte_indexes.append(index) + # Continue searching after the current match + index = byte_text.find(byte_search_term, index + 1) + + return byte_indexes + +def main(): + # Set up argument parser + parser = argparse.ArgumentParser(description="Find byte indexes of a search term in a file.") + parser.add_argument("file", help="Path to the file to be searched") + parser.add_argument("search_term", help="The term to search for in the file") + + # Parse the arguments + args = parser.parse_args() + + # Read the file + try: + with open(args.file, 'r', encoding='utf-8') as f: + file_content = f.read() + except FileNotFoundError: + print(f"Error: File '{args.file}' not found.") + return + except Exception as e: + print(f"Error reading file: {e}") + return + + # Find byte indexes + indexes = find_byte_indexes(file_content, args.search_term) + + # Print the results + if indexes: + print(f"Found '{args.search_term}' at byte indexes: {indexes}") + else: + print(f"'{args.search_term}' not found in the file.") + +if __name__ == "__main__": + main() diff --git a/src/file/content/wikilink.rs b/src/file/content/wikilink.rs index 6199186..78cdf6e 100644 --- a/src/file/content/wikilink.rs +++ b/src/file/content/wikilink.rs @@ -34,7 +34,7 @@ impl Alias { self.0.is_empty() } #[must_use] - pub fn len(&self) -> usize { + pub fn char_len(&self) -> usize { self.0.chars().count() } } @@ -104,26 +104,26 @@ impl Visitor for WikilinkVisitor { .expect("Otherwise the regex wouldn't match") .as_str(), ); + let capture_start_byte = captures + .get(1) + .expect("The regex has 2 capture groups") + .start(); + let text_without_frontmatter = remove_frontmatter_from_source(source, node); + let sourcepos_start_offset_bytes = SourceOffset::from_location( + text_without_frontmatter, + sourcepos.start.line, + sourcepos.start.column, + ) + .offset(); + let span = 
SourceSpan::new( + (sourcepos_start_offset_bytes + capture_start_byte).into(), + alias.char_len(), + ); + let span_repaired = repair_span_due_to_frontmatter(span, node); self.wikilinks.push( Wikilink::builder() .alias(alias.clone()) - .span(repair_span_due_to_frontmatter( - SourceSpan::new( - (SourceOffset::from_location( - remove_frontmatter_from_source(source, node), - sourcepos.start.line, - sourcepos.start.column, - ) - .offset() - + captures - .get(1) - .expect("The regex has 2 capture groups") - .start()) - .into(), - alias.len(), - ), - node, - )) + .span(span_repaired) .build(), ); } diff --git a/src/rules/unlinked_text.rs b/src/rules/unlinked_text.rs index 17552ee..f10b9f4 100644 --- a/src/rules/unlinked_text.rs +++ b/src/rules/unlinked_text.rs @@ -181,20 +181,20 @@ impl Visitor for UnlinkedTextVisitor { continue; } let alias = Alias::new(&patterns[found.pattern().as_usize()]); - let span = repair_span_due_to_frontmatter( - SourceSpan::new( - (SourceOffset::from_location( - remove_frontmatter_from_source(source, node), - sourcepos.start.line, - sourcepos.start.column, - ) - .offset() - + found.start()) - .into(), - found.end() - found.start(), - ), - node, - ); + if "lorem" == alias.to_string() { + println!("Found lorem"); + } + let text_without_frontmatter = remove_frontmatter_from_source(source, node); + let sourcepos_start_offset_bytes = SourceOffset::from_location( + text_without_frontmatter, + sourcepos.start.line, + sourcepos.start.column, + ) + .offset(); + let byte_length = found.end() - found.start(); + let offset_bytes = sourcepos_start_offset_bytes + found.start(); + let span = SourceSpan::new(offset_bytes.into(), byte_length); + let span_repaired = repair_span_due_to_frontmatter(span, node); // Dont match inside wikilinks if let Some(parent) = parent { @@ -204,7 +204,7 @@ impl Visitor for UnlinkedTextVisitor { } } - self.new_unlinked_texts.push((alias, span)); + self.new_unlinked_texts.push((alias, span_repaired)); } } Ok(()) From 
134ac04c8b180be267fd51cb6494e5f4ce51c0a9 Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Tue, 17 Dec 2024 01:23:37 -0500 Subject: [PATCH 2/3] Now handling multibyte characters --- src/visitor.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/visitor.rs b/src/visitor.rs index 0cb6ba5..5683aa5 100644 --- a/src/visitor.rs +++ b/src/visitor.rs @@ -93,6 +93,11 @@ pub enum ParseError { #[backtrace] source: std::io::Error, }, + #[error("Multibyte characters found in the file {file:?}")] + MultibyteError { + file: PathBuf, + backtrace: backtrace::Backtrace, + }, #[error("Error parsing the source code for file {file:?} using tree-sitter")] TreeSitter { file: PathBuf, @@ -119,6 +124,15 @@ pub fn parse(path: &PathBuf, visitors: Vec>>) -> Result< file: path.clone(), source, })?; + + // Check for multibyte characters + if source.chars().count() != source.len() { + return Err(ParseError::MultibyteError { + file: path.clone(), + backtrace: backtrace::Backtrace::force_capture(), + }); + } + // Parse the source code let arena = Arena::new(); let options = ExtensionOptionsBuilder::default() From c0b8030caafb596046384de0f6ba7247aa232f9f Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Tue, 17 Dec 2024 01:27:49 -0500 Subject: [PATCH 3/3] Got rid of multibyte characters in test assets --- tests/logseq/unlinked_text/assets/journals/2024_08_10.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/logseq/unlinked_text/assets/journals/2024_08_10.md b/tests/logseq/unlinked_text/assets/journals/2024_08_10.md index b1f3891..058ed81 100644 --- a/tests/logseq/unlinked_text/assets/journals/2024_08_10.md +++ b/tests/logseq/unlinked_text/assets/journals/2024_08_10.md @@ -1,6 +1,6 @@ - [[Tvtinl]] - Axndco/Pbyudhm - - Zqxzn Avdxyxofu’m + - Zqxzn Avdxyxofu'm - Vgrjsat - Foueh eypvsd? 
- Xotw wlhcgrryrj @@ -8,8 +8,8 @@ - Siuh lnmcnlv - Jztvzm 21 pdyhwjzvnx - Hfxeno aq tmnz d lecc ymdeuz - - Lh cy b brkbkgmtz frg ksocv gomu yh mvr’fw ysnt doeun bzim, eka aclmy vrpa hys oz tuk mpb nlle lyn icazyvey ik my peub xr + - Lh cy b brkbkgmtz frg ksocv gomu yh mvr'fw ysnt doeun bzim, eka aclmy vrpa hys oz tuk mpb nlle lyn icazyvey ik my peub xr - [[Mek Xvmr]] - - “Usp tcr itlo cb zsez hjmrou ler f pdv” + - "Usp tcr itlo cb zsez hjmrou ler f pdv" - Pav bvqv egx ic jxa rto dd amr nwd. Hwt sq vabt kyzk hhx yp, tb j qyt cebfd: - - Jjyyvwrro xkbj acor uzifhktovnah hfbyv, udh hluq hvv jx uoua’o whnp ik wxagb knyiyds + - Jjyyvwrro xkbj acor uzifhktovnah hfbyv, udh hluq hvv jx uoua'o whnp ik wxagb knyiyds