Skip to content

Commit

Permalink
another new version
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaodaigh committed May 28, 2021
1 parent 9068881 commit 2c4e86c
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 8 deletions.
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "TableScraper"
uuid = "3d876f86-fca9-45cb-9864-7207416dc431"
authors = ["ZJ <[email protected]>"]
version = "0.1.1"
version = "0.1.2"

[deps]
Cascadia = "54eefc05-d75b-58de-a785-1a3403f0919f"
Expand All @@ -18,6 +18,7 @@ julia = "1"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"

[targets]
test = ["Test"]
test = ["Test", "DataFrames"]
17 changes: 12 additions & 5 deletions src/TableScraper.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,25 @@ include("Tables.jl")

"""
scrape_tables(url)
scrape_tables(url, cell_transform)
scrape_tables(url, cell_transform[=nodeText])
scrape_tables(url, cell_transform[=nodeText], header_transform[=nodeText])
This function will scrape `url` for any WELL-FORMED tables wrapped in `<table>` tags and return
them in a vector.
# Arguments
- `url`: The URL to look for tables
- `cell_transform`: By default, each of the table cells wrapped in `<td>` have transformed by
- `cell_transform`: By default, each of the table cells wrapped in `<td>` is transformed by
the callable (i.e. `Function` or type definition) `cell_transform`. The default
`cell_transform` is `Cascadia.nodeText` which extracts the node's text. You may wish to use
`identity` to extract just the cell as a `Gumbo.HTMLNode` type for more advanced processing,
e.g. `scrape_tables(url, identity)`
- `header_transform`: By default, each of the table headers wrapped in `<th>` is transformed by
the callable (i.e. `Function` or type definition) `header_transform`. The default
`header_transform` is `Cascadia.nodeText` which extracts the node's text. You may wish to
use `identity` to extract just the header cell as a `Gumbo.HTMLNode` type for more advanced
processing, e.g. `scrape_tables(url, identity, identity)`
# Return
Expand All @@ -33,7 +39,7 @@ The `TableScraper.Table` is a Tables.jl-compatible row-accessible type. So you c
another Tables.jl compatible type if you wish e.g. `DataFrame.(scrape_tables(url))` will return a
vector of `DataFrame`s
"""
function scrape_tables(url, cell_transform=nodeText)::Vector{Table}
function scrape_tables(url, cell_transform=nodeText, header_transform=nodeText)::Vector{Table}
result_tables = []

response::HTTP.Messages.Response =
Expand All @@ -59,13 +65,14 @@ function scrape_tables(url, cell_transform=nodeText)::Vector{Table}
for (header, table_elem) in zip(headers, tables_elems)
for header1 in eachmatch(sel"tr th", table_elem)
# check the header span
if haskey(header1.attributes, "colspan")
# colspan expansion only happens with the default nodeText; you are on your own if you don't use nodeText
if (nodeText == header_transform) & haskey(header1.attributes, "colspan")
colspan = parse(Int, header1.attributes["colspan"])
for i in 1:colspan
push!(header, nodeText(header1)*"$i")
end
else
push!(header, nodeText(header1))
push!(header, header_transform(header1))
end
end
end
Expand Down
10 changes: 9 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
using TableScraper
using DataFrames
using Test

# Smoke test: scrape a live page and check that its first table converts to a
# non-empty DataFrame. NOTE(review): presumably needs network access to the URL below.
@testset "TableScraper.jl" begin
# Take the first scraped table ([1]) and pipe it into the DataFrame constructor.
table = scrape_tables("https://www.agenas.gov.it/covid19/web/index.php?r=site%2Fprovvedimento&q=010")[1] |> DataFrame;

# The converted table is expected to contain at least one row.
@test nrow(table) > 0
end

# Smoke test against goratings.org: the second scraped table ([2]) is converted to a
# DataFrame and must be non-empty. NOTE(review): presumably needs network access.
@testset "TableScraper.jl goratings" begin
table = scrape_tables("https://goratings.org")[2] |> DataFrame;
# The converted table is expected to contain at least one row.
@test nrow(table) > 0
end

2 comments on commit 2c4e86c

@xiaodaigh
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/37709

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.2 -m "<description of version>" 2c4e86cdff49bde3fc0ff83f5fdeeb1fef97a448
git push origin v0.1.2

Please sign in to comment.