DocsScraper sub repo #5

Open · wants to merge 8 commits into base: main

Changes from 2 commits
10 changes: 10 additions & 0 deletions DocsScraper/Project.toml
@@ -0,0 +1,10 @@
name = "DocsScraper"
uuid = "6a596b1c-bad6-44d4-a29c-1a7b4368ba96"
authors = ["Shreyas Agrawal <[email protected]>"]
version = "0.1.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
24 changes: 24 additions & 0 deletions DocsScraper/README.md
@@ -0,0 +1,24 @@
# DocsScraper: a document scraping and parsing tool for building a custom RAG database for AIHelpMe.jl

This tool is used to collect and parse Julia's extensive documentation.

## Requirements

```julia
using Pkg
Pkg.add("HTTP")
Pkg.add("Gumbo")
Pkg.add("AbstractTrees")
Pkg.add("URIs")
```
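
The same packages can also be added in a single call:

```julia
using Pkg
Pkg.add(["HTTP", "Gumbo", "AbstractTrees", "URIs"])
```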

## Usage
1. **Basic Usage**:
```julia
parsed_text = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")
```

## How it works
`parse_url(url::String)` extracts the base URL, then recursively walks the page's HTML so that all nested text and code is returned as a Vector of Dicts, along with each text/code chunk's metadata.
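
For instance, a minimal sketch of the returned structure (the keys follow the example in the source docstring):

```julia
parsed_text = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")

# Each entry carries one content key ("heading", "text", or "code") plus a
# "metadata" Dict mapping heading tags to their titles, e.g.:
# Dict("text"     => "para 1",
#      "metadata" => Dict("h1" => "Heading 1", "h2" => "Heading 2"))
```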

Please note that this is a pre-release; more work remains to be done.
152 changes: 152 additions & 0 deletions DocsScraper/src/DocsScraper.jl
@@ -0,0 +1,152 @@
"""
get_base_url(url::String)

Extracts the base url.

# Arguments
- `url`: The url string of which, the base url needs to be extracted
"""
function get_base_url(url::String)
    parsed_url = URIs.URI(url)
    # URIs.jl stores the port as a (possibly empty) string, so check with isempty
    base_url = string(parsed_url.scheme, "://", parsed_url.host,
        isempty(parsed_url.port) ? "" : ":" * parsed_url.port, parsed_url.path)
    return base_url
end
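
# Illustrative example (hypothetical values), showing that fragments and query
# strings are dropped:
#   get_base_url("https://docs.julialang.org/en/v1/base/multi-threading/#man-multithreading")
#   # => "https://docs.julialang.org/en/v1/base/multi-threading/"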


"""
process_nodes(node, heading_hierarchy::Dict, parsed_text::Vector)

Process the nodes recursively. The function recursively calls itself for every HTML hierarchy, thereby going deeper to retrieve the text.

# Arguments
- `node`: The root HTML node
- `heading_hierarchy`: Dictionary used to store metadata
- `parsed_text`: Vector of Dicts to store parsed text and metadata

# TODO:
- Inline code blocks are the ones present inside <p> or <li>. put `` around the inline code blocks. Make changes inside `elseif tag_name == "p" || tag_name == "li"`
"""
function process_nodes(node, heading_hierarchy::Dict, parsed_text::Vector)
    # if the node is an HTML element
    if typeof(node) <: Gumbo.HTMLElement
        tag_name = String(Gumbo.tag(node))

        # Process headings: if the current tag is a heading, remove every heading of
        # equal or lower level stored (for metadata) in heading_hierarchy, then add
        # the current heading tag. Once we come back up to a given heading level,
        # the lower-level headings encountered earlier no longer apply.
        # E.g. with Dict("h1" => ..., "h2" => ..., "h3" => ...), a new <h2> drops
        # the old "h2" and "h3" entries before the new "h2" is stored.
        if startswith(tag_name, "h") && isdigit(last(tag_name))
            # Clear headings of equal or lower level
            for k in collect(keys(heading_hierarchy))
                if Base.parse(Int, last(k)) >= Base.parse(Int, last(tag_name))
                    delete!(heading_hierarchy, k)
                end
            end
            heading_hierarchy[tag_name] = strip(Gumbo.text(node))
            push!(parsed_text, Dict("heading" => strip(Gumbo.text(node)), "metadata" => copy(heading_hierarchy)))

        # if the current tag is <code>, store its textual value under the key "code"
        elseif tag_name == "code"
            # Start a new code block
            code_content = strip(Gumbo.text(node))
            push!(parsed_text, Dict("code" => code_content, "metadata" => copy(heading_hierarchy)))
            return ""

        # if the current tag is <p> or <li>, return the text of the whole tag
        elseif tag_name == "p" || tag_name == "li"
            return strip(Gumbo.text(node))

        # for any other tag, recursively call process_nodes to go deeper into the HTML hierarchy
        else
            # Recursively process child nodes, pushing any text they return
            for child in AbstractTrees.children(node)
                received_text = process_nodes(child, heading_hierarchy, parsed_text)
                if !isempty(strip(received_text))
                    push!(parsed_text, Dict("text" => strip(received_text), "metadata" => copy(heading_hierarchy)))
                end
            end
        end

    # if the node is HTMLText, return its text
    elseif node isa Gumbo.HTMLText
        return strip(Gumbo.text(node))
    end

    return ""
end
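
# A minimal sketch (hypothetical snippet) of how process_nodes fills parsed_text:
#   doc = Gumbo.parsehtml("<h1>Title</h1><p>body text</p>")
#   parsed = []
#   process_nodes(doc.root, Dict(), parsed)
#   # parsed => [Dict("heading" => "Title",     "metadata" => Dict("h1" => "Title")),
#   #            Dict("text"    => "body text", "metadata" => Dict("h1" => "Title"))]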

"""
parse_url(url::String)

Initiator and main function to parse HTML from url

# Arguments
- `url`: URL string to parse

# Returns
- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata

# Usage
parsed_text = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")

# Example
Let the HTML be:
<!DOCTYPE html>
<html>
<body>

<h1>Heading 1</h1>
<h2>Heading 2</h2>
<p>para 1</p>
<h3>Heading 3</h3>
<code>this is my code block</code>
<h3>This is another h3 under Heading 2</h3>
<p>This is a paragraph with <code>inline code</code></p>

<h2>Heading 2_2</h2>
<p>para ewg</p>

</body>
</html>

Output:
Any[
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3")
Dict{String, Any}("code" => "this is my code block", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"))
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2")
Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg")
]

# TODO:
- Input should be Vector of URL strings to parse multiple URLs
- Use multithreading to simultaneously parse multiple URLs
"""

splendidbug marked this conversation as resolved.
Show resolved Hide resolved

function parse_url(url::String)
    base_url = get_base_url(url)
    r = HTTP.get(base_url)
    r_parsed = parsehtml(String(r.body))

    # Get the title of the document (currently unused in the return value)
    title = [el for el in AbstractTrees.PreOrderDFS(r_parsed.root)
             if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")

    # Grab the element with class "content"; `only` throws if there is not exactly one
    content_ = [el for el in AbstractTrees.PreOrderDFS(r_parsed.root)
                if el isa HTMLElement && getattr(el, "class", nothing) == "content"] |> only

    parsed_text = []
    heading_hierarchy = Dict()
    process_nodes(content_, heading_hierarchy, parsed_text)
    return parsed_text
end
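
# A minimal usage sketch (requires network access; assumes the page has exactly
# one element whose class attribute is "content"):
#   parsed = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")
#   code_chunks = filter(chunk -> haskey(chunk, "code"), parsed)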