DocsScraper sub repo #5

Open · wants to merge 8 commits into base: main

Changes from 2 commits
10 changes: 10 additions & 0 deletions DocsScraper/Project.toml
@@ -0,0 +1,10 @@
name = "DocsScraper"
uuid = "6a596b1c-bad6-44d4-a29c-1a7b4368ba96"
authors = ["Shreyas Agrawal <[email protected]>"]
version = "0.1.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
24 changes: 24 additions & 0 deletions DocsScraper/README.md
@@ -0,0 +1,24 @@
# DocsScraper: a document scraping and parsing tool for building a custom RAG database for AIHelpMe.jl

This tool is used to collect and parse Julia's extensive documentation.

## Requirements

```julia
using Pkg
Pkg.add("HTTP")
Pkg.add("Gumbo")
Pkg.add("AbstractTrees")
Pkg.add("URIs")
```
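
The same packages can also be added in a single call:

```julia
using Pkg
Pkg.add(["HTTP", "Gumbo", "AbstractTrees", "URIs"])
```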

## Usage
1. **Basic Usage**:
```julia
parsed_text = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")
```

## How it works
`parse_url(url::String)` extracts the base URL, then recursively walks the page's HTML so that all nested text and code is returned as a Vector of Dicts, along with each text/code chunk's metadata.
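
For instance, a minimal sketch of the returned structure (the keys follow the example in the source docstring):

```julia
parsed_text = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")

# Each entry carries one content key ("heading", "text", or "code") plus a
# "metadata" Dict mapping heading tags to their titles, e.g.:
# Dict("text"     => "para 1",
#      "metadata" => Dict("h1" => "Heading 1", "h2" => "Heading 2"))
```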

Please note that this is a pre-release; more work remains to be done.
152 changes: 152 additions & 0 deletions DocsScraper/src/DocsScraper.jl
@@ -0,0 +1,152 @@
"""
get_base_url(url::String)

Extracts the base url.

# Arguments
- `url`: The url string of which, the base url needs to be extracted
"""
function get_base_url(url::String)
    parsed_url = URIs.URI(url)
    # URIs.jl stores the port as a (possibly empty) string, so check with isempty
    base_url = string(parsed_url.scheme, "://", parsed_url.host,
        isempty(parsed_url.port) ? "" : ":" * parsed_url.port, parsed_url.path)
    return base_url
end
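
# Illustrative example (hypothetical values), showing that fragments and query
# strings are dropped:
#   get_base_url("https://docs.julialang.org/en/v1/base/multi-threading/#man-multithreading")
#   # => "https://docs.julialang.org/en/v1/base/multi-threading/"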


"""
process_nodes(node, heading_hierarchy::Dict, parsed_text::Vector)

Process the nodes recursively. The function recursively calls itself for every HTML hierarchy, thereby going deeper to retrieve the text.

# Arguments
- `node`: The root HTML node
- `heading_hierarchy`: Dictionary used to store metadata
- `parsed_text`: Vector of Dicts to store parsed text and metadata

# TODO:
- Inline code blocks are the ones present inside <p> or <li>. put `` around the inline code blocks. Make changes inside `elseif tag_name == "p" || tag_name == "li"`
"""
function process_nodes(node, heading_hierarchy::Dict, parsed_text::Vector)
    # if the node is an HTML element
    if typeof(node) <: Gumbo.HTMLElement
        tag_name = String(Gumbo.tag(node))

        # Process headings: if the current tag is a heading, remove every heading of
        # equal or lower level stored (for metadata) in heading_hierarchy, then add
        # the current heading tag. Once we come back up to a given heading level,
        # the lower-level headings encountered earlier no longer apply.
        # E.g. with Dict("h1" => ..., "h2" => ..., "h3" => ...), a new <h2> drops
        # the old "h2" and "h3" entries before the new "h2" is stored.
        if startswith(tag_name, "h") && isdigit(last(tag_name))
            # Clear headings of equal or lower level
            for k in collect(keys(heading_hierarchy))
                if Base.parse(Int, last(k)) >= Base.parse(Int, last(tag_name))
                    delete!(heading_hierarchy, k)
                end
            end
            heading_hierarchy[tag_name] = strip(Gumbo.text(node))
            push!(parsed_text, Dict("heading" => strip(Gumbo.text(node)), "metadata" => copy(heading_hierarchy)))

        # if the current tag is <code>, store its textual value under the key "code"
        elseif tag_name == "code"
            # Start a new code block
            code_content = strip(Gumbo.text(node))
            push!(parsed_text, Dict("code" => code_content, "metadata" => copy(heading_hierarchy)))
            return ""

        # if the current tag is <p> or <li>, return the text of the whole tag
        elseif tag_name == "p" || tag_name == "li"
            return strip(Gumbo.text(node))

        # for any other tag, recursively call process_nodes to go deeper into the HTML hierarchy
        else
            # Recursively process child nodes, pushing any text they return
            for child in AbstractTrees.children(node)
                received_text = process_nodes(child, heading_hierarchy, parsed_text)
                if !isempty(strip(received_text))
                    push!(parsed_text, Dict("text" => strip(received_text), "metadata" => copy(heading_hierarchy)))
                end
            end
        end

    # if the node is HTMLText, return its text
    elseif node isa Gumbo.HTMLText
        return strip(Gumbo.text(node))
    end

    return ""
end
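
# A minimal sketch (hypothetical snippet) of how process_nodes fills parsed_text:
#   doc = Gumbo.parsehtml("<h1>Title</h1><p>body text</p>")
#   parsed = []
#   process_nodes(doc.root, Dict(), parsed)
#   # parsed => [Dict("heading" => "Title",     "metadata" => Dict("h1" => "Title")),
#   #            Dict("text"    => "body text", "metadata" => Dict("h1" => "Title"))]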

"""
parse_url(url::String)

Initiator and main function to parse HTML from url

# Arguments
- `url`: URL string to parse

# Returns
- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata

# Usage
parsed_text = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")

# Example
Let the HTML be:
<!DOCTYPE html>
<html>
<body>

<h1>Heading 1</h1>
<h2>Heading 2</h2>
<p>para 1</p>
<h3>Heading 3</h3>
<code>this is my code block</code>
<h3>This is another h3 under Heading 2</h3>
<p>This is a paragraph with <code>inline code</code></p>

<h2>Heading 2_2</h2>
<p>para ewg</p>

</body>
</html>

Output:
Any[
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3")
Dict{String, Any}("code" => "this is my code block", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"))
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg")
]

# TODO:
- Input should be Vector of URL strings to parse multiple URLs
- Use multithreading to simultaneously parse multiple URLs
"""

splendidbug marked this conversation as resolved.
Show resolved Hide resolved

function parse_url(url::String)
    base_url = get_base_url(url)
    r = HTTP.get(base_url)
    r_parsed = parsehtml(String(r.body))

    # Get the title of the document (currently unused in the return value)
    title = [el for el in AbstractTrees.PreOrderDFS(r_parsed.root)
             if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")

    # Grab the element with class "content"; `only` throws if there is not exactly one
    content_ = [el for el in AbstractTrees.PreOrderDFS(r_parsed.root)
                if el isa HTMLElement && getattr(el, "class", nothing) == "content"] |> only

    parsed_text = []
    heading_hierarchy = Dict()
    process_nodes(content_, heading_hierarchy, parsed_text)
    return parsed_text
end
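
# A minimal usage sketch (requires network access; assumes the page has exactly
# one element whose class attribute is "content"):
#   parsed = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")
#   code_chunks = filter(chunk -> haskey(chunk, "code"), parsed)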