another new version

xiaodaigh · May 28, 2021 · 2c4e86c · 2c4e86c · xiaodaigh · May 28, 2021
1 parent 9068881
commit 2c4e86c
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 8 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TableScraper"
 uuid = "3d876f86-fca9-45cb-9864-7207416dc431"
 authors = ["ZJ <[email protected]>"]
-version = "0.1.1"
+version = "0.1.2"
 
 [deps]
 Cascadia = "54eefc05-d75b-58de-a785-1a3403f0919f"
@@ -18,6 +18,7 @@ julia = "1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 
 [targets]
-test = ["Test"]
+test = ["Test", "DataFrames"]
diff --git a/src/TableScraper.jl b/src/TableScraper.jl
@@ -11,19 +11,25 @@ include("Tables.jl")
 
 """
     scrape_tables(url)
-    scrape_tables(url, cell_transform)
+    scrape_tables(url, cell_transform[=nodeText])
+    scrape_tables(url, cell_transform[=nodeText], header_transform[=nodeText])
 
 This function will scrape `url` for any WELL-FORMED tables wrapped in `<table>` tags and return
 them in a vector.
 
 # Arguments
 
     - `url`: The URL to look for tables
-    - `cell_transform`: By default, each of the table cells wrapped in `<td>` have transformed by
+    - `cell_transform`: By default, each of the table cells wrapped in `<td>` is transformed by
         the callable (i.e. `Function` or type definition) `cell_transform`. The default
         `cell_transform` is `Cascadia.nodeText` which extracts the node's text. You may wish to use
         `identity` to extract just the cell as a `Gumbo.HTMLNode` type for more advanced processing,
         e.g. `scrape_tables(url, identity)`
+    - `header_transform`: By default, each of the table headers wrapped in `<th>` is transformed by
+        the callable (i.e. `Function` or type definition) `header_transform`. The default
+        `header_transform` is `Cascadia.nodeText` which extracts the node's text. You may wish to
+        use `identity` to extract just the cell as a `Gumbo.HTMLNode` type for more advanced
+        processing, e.g. `scrape_tables(url, identity, identity)`
 
 # Return
 
@@ -33,7 +39,7 @@ The `TableScraper.Table` is a Tables.jl-compatible row-accessible type. So you c
 another Tables.jl compatible type if you wish e.g. `DataFrame.(scrape_tables(url))` will return a
 vector of `DataFrame`s
 """
-function scrape_tables(url, cell_transform=nodeText)::Vector{Table}
+function scrape_tables(url, cell_transform=nodeText, header_transform=nodeText)::Vector{Table}
     result_tables = []
 
     response::HTTP.Messages.Response =
@@ -59,13 +65,14 @@ function scrape_tables(url, cell_transform=nodeText)::Vector{Table}
     for (header, table_elem) in zip(headers, tables_elems)
         for header1 in eachmatch(sel"tr th", table_elem)
             # check the header span
-            if haskey(header1.attributes, "colspan")
+            # you are on your own if you don't use nodeText
+            if (nodeText == header_transform) & haskey(header1.attributes, "colspan")
                 colspan = parse(Int, header1.attributes["colspan"])
                 for i in 1:colspan
                     push!(header, nodeText(header1)*"$i")
                 end
             else
-                push!(header, nodeText(header1))
+                push!(header, header_transform(header1))
             end
         end
     end

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,6 +1,14 @@
 using TableScraper
+using DataFrames
 using Test
 
 @testset "TableScraper.jl" begin
-    # Write your tests here.
+    table = scrape_tables("https://www.agenas.gov.it/covid19/web/index.php?r=site%2Fprovvedimento&q=010")[1] |> DataFrame;
+
+    @test nrow(table) > 0
+end
+
+@testset "TableScraper.jl goratings" begin
+    table = scrape_tables("https://goratings.org")[2] |> DataFrame;
+    @test nrow(table) > 0
 end