From 4b6ced7ba94ff4e89715ee64a676fd6fc0568591 Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Sat, 21 Dec 2024 14:11:53 -0500 Subject: [PATCH] Added ignore pairs and updated readme --- Cargo.toml | 2 +- Readme.md | 53 ++++++++++++++++++++++++++--------- src/config.rs | 5 ++++ src/config/cli.rs | 3 ++ src/config/file.rs | 16 ++++++++--- src/lib.rs | 1 + src/rules/similar_filename.rs | 11 ++++++++ 7 files changed, 72 insertions(+), 19 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 18089c1..7d53eb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mdlinker" -version = "1.4.0" +version = "1.5.0" edition = "2021" [profile.dev] diff --git a/Readme.md b/Readme.md index 394e170..bc8c3f9 100644 --- a/Readme.md +++ b/Readme.md @@ -1,6 +1,6 @@ # MdLinker -A linter whose goal is to lint wikilinks in a variety of markdown note taking apps to enable maximal networked thinking. +A linter whose goal is to lint wikilinks in a variety of markdown note taking apps to enable maximal networked thinking. Currently supports [logseq](https://logseq.com/). Is fundamentally designed to aide in the [zettelkasten method](https://zettelkasten.de/overview/) of note taking. Uses [miette](https://github.com/zkat/miette) for beautiful :crab: rust style error messages. :crab: @@ -31,7 +31,39 @@ Linking works best when you spell things correctly, in both your filenames and f Put a `mdlinker.toml` in your project root to configure the linter. -Options are defined in [`src/config/file.rs`](src/config/file.rs) as a serde object, and can be overwritten in the cli, see `mdlinker --help`. +Options are defined in [`src/config/file.rs`](src/config/file.rs) as a serde object, and can be overwritten in the cli, see `mdlinker --help` and the docstrings for full details. + +```toml +# This is the folder where filenames which represent linkable words go. +pages_directory = "pages" + +# These are any other folders you wish to scan. These can't be linked to. 
Usually these are things which link to items in the pages_directory. +other_directories = ["journal", "notes"] + +# Exclusions +# This is how you silence specific rules or instances of errors +# It accepts glob patterns +exclude = [ + "rule:category:*", + "rule:category:error:id:as:found:in:the:error:output", + "..." +] +ignore_word_pairs = [ + ["foo", "foobar"], +] # These are pairs of words which look similar in your filenames but are not the same. Suppresses SimilarFilename rule. + +# The Similar Filename rule can match on n_grams, like "Barack Obama". But in order to do this, you need to set the max number of words in an ngram. +# You really don't need to change any of these +ngram_size = 3 +boundary_pattern = "___" # This is a regex pattern to match on filenames to stop ngram generation (like at a hierarchy or sentence boundary). In logseq this is represented with three underscores. +filename_match_threshold = 100 # This is the similarity threshold for the similar filename rule. It is an integer corresponding to the output of the [fuzzy-matcher](https://github.com/skim-rs/fuzzy-matcher) crate. +filename_spacing_pattern = '-|_|\s' # This is a regex pattern to split filenames into words. It is used for the ngram generation. + +# Compatibility +# These are options that are meant to help us eventually prototype this system for other tools like obsidian. They convert filenames in the "pages_directory" to aliases, and aliases to filenames in the "pages_directory". Do not change these unless you know what you are doing. +filename_to_alias = ["___", "/"] +alias_to_filename = ["/", "___"] +``` # Lint Rules @@ -39,16 +71,6 @@ Options are defined in [`src/config/file.rs`](src/config/file.rs) as a serde obj - [X] Duplicate Alias: If using something like [logseq aliases](https://unofficial-logseq-docs.gitbook.io/unofficial-logseq-docs/beginner-to-advance-features/aliases), make sure they are always unique (also compares them to filenames). 
- [X] Broken Wikilink: Some wikilinks linked resource does not exist. Maybe you should create the page, or maybe the link title is misspelled? - [X] Unlinked Text: Text was detected which is very similar to some file title or alias. Maybe you should wrap it as a link? -- [ ] RelatesTo: At least in logseq, I find it annoying the graph wont show two items as linked if they are linked within the same block. These will identify that and force you to link them in the frontmatter. - - RelatesToParagraph: Two wikilinks in the same paragraph - - RelatesToListItem: Two wikilinks in the same list item - - RelatesToListItemDescendant: One wikilink in a sublist item of another wikilinks listitem - - RelatesToNotFound: delete a relation which doesnt match any of the above for cleanliness - -# Future - -- [ ] LSP Implementation -- [ ] Logseq Plugin # Compatibility @@ -58,8 +80,11 @@ Options are defined in [`src/config/file.rs`](src/config/file.rs) as a serde obj - [X] Logseq Aliases (in Yaml Front Matter) - [X] `[[url]]` and `[[title|url]]` style wikilinks - [X] #[[url]] and #url tags +- [ ] Links to other files in the "other_directories" - [ ] Marksman [[#url]] tags - [ ] Logseq properties ":: style" (Won't implement, use yaml front matter) -- [ ] Obsidian Folder Structure (Unknown, PRs welcome) -- [ ] Obsidian Aliases (Unknown, PRs welcome) +- [ ] Obsidian Folder Structure (PRs welcome) +- [ ] Obsidian Aliases (PRs welcome) - [ ] [Marksman](https://github.com/artempyanykh/marksman) +- [ ] [Roam](https://roamresearch.com/) +- [ ] [Zettelkasten](https://zettelkasten.de/) diff --git a/src/config.rs b/src/config.rs index 202b40d..295f3ba 100644 --- a/src/config.rs +++ b/src/config.rs @@ -69,6 +69,9 @@ pub struct Config { /// See [`self::cli::Config::allow_dirty`] #[builder(default = false)] pub allow_dirty: bool, + /// See [`self::file::Config::ignore_word_pairs`] + #[builder(default = vec![])] + pub ignore_word_pairs: Vec<(String, String)>, } /// Things which implement the 
partial config trait @@ -91,6 +94,7 @@ pub trait Partial { ) -> Option, ReplacePairCompilationError>>; fn fix(&self) -> Option; fn allow_dirty(&self) -> Option; + fn ignore_word_pairs(&self) -> Option>; } /// Now we implement a combine function for patrial configs which @@ -124,6 +128,7 @@ fn combine_partials(partials: &[&dyn Partial]) -> Result .ok_or(NewConfigError::PagesDirectoryMissing)?, ) .maybe_other_directories(partials.iter().find_map(|p| p.other_directories())) + .maybe_ignore_word_pairs(partials.iter().find_map(|p| p.ignore_word_pairs())) .build()) } diff --git a/src/config/cli.rs b/src/config/cli.rs index 15f7712..56f9f82 100644 --- a/src/config/cli.rs +++ b/src/config/cli.rs @@ -110,4 +110,7 @@ impl Partial for Config { fn allow_dirty(&self) -> Option { Some(self.allow_dirty) } + fn ignore_word_pairs(&self) -> Option> { + None + } } diff --git a/src/config/file.rs b/src/config/file.rs index ea732c6..8432445 100644 --- a/src/config/file.rs +++ b/src/config/file.rs @@ -29,10 +29,6 @@ pub(super) struct Config { #[serde(default)] pub boundary_pattern: Option, - /// See [`super::cli::Config::wikilink_pattern`] - #[serde(default)] - pub wikilink_pattern: Option, - /// See [`super::cli::Config::filename_spacing_pattern`] #[serde(default)] pub filename_spacing_pattern: Option, @@ -45,6 +41,11 @@ pub(super) struct Config { #[serde(default)] pub exclude: Vec, + /// In the [`crate::rules::similar_filename::SimilarFilename`] rule, ignore certain word pairs + /// Prevents some annoying and frequent false positives + #[serde(default)] + pub ignore_word_pairs: Vec<(String, String)>, + /// Convert an alias to a filename /// Kinda like a sed command #[serde(default)] @@ -139,4 +140,11 @@ impl Partial for Config { fn allow_dirty(&self) -> Option { None } + fn ignore_word_pairs(&self) -> Option> { + if self.ignore_word_pairs.is_empty() { + None + } else { + Some(self.ignore_word_pairs.clone()) + } + } } diff --git a/src/lib.rs b/src/lib.rs index e837a30..edc6ad6 100644 
--- a/src/lib.rs +++ b/src/lib.rs @@ -226,6 +226,7 @@ fn check(config: &config::Config) -> Result { &file_ngrams, config.filename_match_threshold, &filename_spacing_regex, + config, )? .finalize(&config.exclude); reports.extend( diff --git a/src/rules/similar_filename.rs b/src/rules/similar_filename.rs index 78f59c4..493a5bd 100644 --- a/src/rules/similar_filename.rs +++ b/src/rules/similar_filename.rs @@ -146,6 +146,7 @@ impl SimilarFilename { file_ngrams: &HashMap, filename_match_threshold: i64, spacing_regex: &Regex, + config: &Config, ) -> Result, MissingSubstringError> { // Convert all filenames to a single string // Check if any two file ngrams fuzzy match @@ -172,6 +173,16 @@ impl SimilarFilename { continue; } + // TODO: This can be improved computationally using a hashmap + for (a, b) in &config.ignore_word_pairs { + if &ngram.to_string() == a && &other_ngram.to_string() == b { + continue; + } + if &ngram.to_string() == b && &other_ngram.to_string() == a { + continue; + } + } + if let Some(bar) = &file_crosscheck_bar { bar.inc(1); }