Skip to content

Commit

Permalink
Handle Wikipedia code blocks in /fetch command (zed-industries#12780)
Browse files Browse the repository at this point in the history
This PR extends the `/fetch` command with support for Wikipedia code
blocks.

Release Notes:

- N/A
  • Loading branch information
maxdeviant authored Jun 7, 2024
1 parent 9174858 commit 834089f
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 4 deletions.
5 changes: 4 additions & 1 deletion crates/assistant/src/slash_command/fetch_command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,15 @@ impl FetchSlashCommand {
Box::new(markdown::ListHandler),
Box::new(markdown::TableHandler::new()),
Box::new(markdown::StyledTextHandler),
Box::new(markdown::CodeHandler),
];
if url.contains("wikipedia.org") {
use html_to_markdown::structure::wikipedia;

handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
handlers.push(Box::new(wikipedia::WikipediaInfoboxHandler));
handlers.push(Box::new(wikipedia::WikipediaCodeHandler::new()));
} else {
handlers.push(Box::new(markdown::CodeHandler));
}

convert_html_to_markdown(&body[..], handlers)
Expand Down
2 changes: 1 addition & 1 deletion crates/html_to_markdown/src/markdown_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ impl MarkdownWriter {
}

let text = text
.trim_matches(|char| char == '\n' || char == '\r')
.trim_matches(|char| char == '\n' || char == '\r' || char == '\t')
.replace('\n', " ");

self.push_str(&text);
Expand Down
104 changes: 102 additions & 2 deletions crates/html_to_markdown/src/structure/wikipedia.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::html_element::HtmlElement;
use crate::markdown_writer::{MarkdownWriter, StartTagOutcome};
use crate::markdown_writer::{HandlerOutcome, MarkdownWriter, StartTagOutcome};
use crate::HandleTag;

pub struct WikipediaChromeRemover;
Expand Down Expand Up @@ -30,7 +30,7 @@ impl HandleTag for WikipediaChromeRemover {
return StartTagOutcome::Skip;
}

let classes_to_skip = ["mw-editsection", "mw-jump-link"];
let classes_to_skip = ["noprint", "mw-editsection", "mw-jump-link"];
if tag.has_any_classes(&classes_to_skip) {
return StartTagOutcome::Skip;
}
Expand All @@ -42,6 +42,106 @@ impl HandleTag for WikipediaChromeRemover {
}
}

/// Tag handler that returns [`StartTagOutcome::Skip`] for Wikipedia infobox
/// tables, i.e. `<table>` elements carrying the `infobox` class.
pub struct WikipediaInfoboxHandler;

impl HandleTag for WikipediaInfoboxHandler {
    /// Reports interest in `table` tags only.
    fn should_handle(&self, tag: &str) -> bool {
        tag == "table"
    }

    /// Returns `Skip` when `tag` is a `table` with the `infobox` class and
    /// `Continue` for every other element. The writer is never touched.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        _writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        let is_infobox_table = tag.tag.as_str() == "table" && tag.has_class("infobox");

        if is_infobox_table {
            StartTagOutcome::Skip
        } else {
            StartTagOutcome::Continue
        }
    }
}

/// Tag handler for Wikipedia code blocks.
///
/// `language` is the pending language name for the next fenced code block;
/// it starts out as `None`.
#[derive(Default)]
pub struct WikipediaCodeHandler {
    language: Option<String>,
}

impl WikipediaCodeHandler {
    /// Creates a handler with no pending language.
    ///
    /// Equivalent to [`WikipediaCodeHandler::default`].
    pub fn new() -> Self {
        Self::default()
    }
}

impl HandleTag for WikipediaCodeHandler {
    /// Reports interest in `div`, `pre`, and `code` tags.
    fn should_handle(&self, tag: &str) -> bool {
        matches!(tag, "div" | "pre" | "code")
    }

    /// Handles an opening tag:
    ///
    /// - `code`: writes an opening backtick unless the writer is inside a `pre`.
    /// - `div`: records the language taken from a `mw-highlight-lang-<language>`
    ///   class, or clears the recorded language when no such class is present.
    /// - `pre`: writes a blank line and an opening code fence, followed by the
    ///   recorded language (if any), which is consumed in the process.
    ///
    /// Always returns [`StartTagOutcome::Continue`].
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        match tag.tag.as_str() {
            "code" => {
                // Inside `pre` the fence already delimits the code.
                if !writer.is_inside("pre") {
                    writer.push_str("`");
                }
            }
            "div" => {
                let classes = tag.classes();
                // Only a class that *starts* with the marker names a language;
                // a class that merely contains it somewhere is not a match.
                self.language = classes.iter().find_map(|class| {
                    class
                        .strip_prefix("mw-highlight-lang-")
                        .map(|language| language.trim().to_owned())
                });
            }
            "pre" => {
                writer.push_blank_line();
                writer.push_str("```");
                // `take` so a language is attached to at most one fence.
                if let Some(language) = self.language.take() {
                    writer.push_str(&language);
                }
                writer.push_newline();
            }
            _ => {}
        }

        StartTagOutcome::Continue
    }

    /// Handles a closing tag: writes the closing backtick for a `code` that is
    /// not inside a `pre`, and the closing fence for a `pre`.
    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
        match tag.tag.as_str() {
            "code" => {
                if !writer.is_inside("pre") {
                    writer.push_str("`");
                }
            }
            "pre" => writer.push_str("\n```\n"),
            _ => {}
        }
    }

    /// Writes text found inside a `pre` to the writer unchanged and reports it
    /// as handled; any other text is reported as [`HandlerOutcome::NoOp`].
    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
        if writer.is_inside("pre") {
            writer.push_str(text);
            return HandlerOutcome::Handled;
        }

        HandlerOutcome::NoOp
    }
}

#[cfg(test)]
mod tests {
use indoc::indoc;
Expand Down

0 comments on commit 834089f

Please sign in to comment.