Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
hyee committed Nov 10, 2017
1 parent 03023d9 commit 77c5468
Show file tree
Hide file tree
Showing 4 changed files with 310 additions and 394 deletions.
35 changes: 27 additions & 8 deletions chm.lua
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ local target_doc_root='f:\\BM\\newdoc11\\'
.hhc/.hhk/.hhp files are all created under the root path
.hhc => Content rules(buildJson): target.json
.hhc => Content rules(buildJson): target.db/target.json
.hhk => Index rules(buildIdx):
1. Common books => index.htm:
<dl> -> <dd[class='*ix']>content,<a[href]>
Expand Down Expand Up @@ -39,6 +39,9 @@ local target_doc_root='f:\\BM\\newdoc11\\'
a. Replace '.htm?<parameters>' with '.htm'
b. Calculate the <relative_path> based on the root path, replace '\' with '.', and assign the result as the <file_name>
c. Final address is 'MS-ITS:<file_name>.chm::/<relative_path>/<html_file(#...)?>'
5). For all 'a' links that start with 'http', set attribute target="_blank"
6). For the content inside "<footer></footer>", if it contains the prev/next navigation, then add the bottom bar
7). For sections that come after a p="part" section, move them under it as children; for sections with p="appendix", move them into the appendix part
Book list rules: all directories that contain 'toc.htm'
--]]
Expand Down Expand Up @@ -249,7 +252,7 @@ function builder:buildIdx()
local function access_childs(li,level)
if li.name~="li" or not li.nodes[1] then return end
local content=li:getcontent():gsub('^%s+','')
local n={name=content:gsub('[%s,]+<.*>.*$',''),ref={}}
local n={name=content:gsub('[%s,]+<.+>.*$',''),ref={}}
if n.name=="" then return end
if level==1 then
tree[#tree+1],sql_keys[n.name:upper()]=n,nil
Expand All @@ -263,7 +266,9 @@ function builder:buildIdx()
local lis=li:select("li")
if li.nodes[1].name~="ul" then
for _,a in ipairs(li:select("a")) do
n.ref[#n.ref+1]=a.attributes.href
if a.parent==li or (a.parent and a.parent.name=="span" and a.parent.parent==li) then
n.ref[#n.ref+1]=a.attributes.href
end
end

if level>1 and #n.ref==0 and #treenode[level-1].ref==0 then
Expand Down Expand Up @@ -464,9 +469,20 @@ function builder:buildJson()
for i=last,1,-1 do
local node=root.docs[1].c[i]
local p=node.p
if (node.t==node.seq or not node.t) and node.h then
local url=(self.full_dir..node.h):gsub("(html?)#.+$","%1")
txt=self.read(url)
if txt then
local title=txt:match("<title>(.-)</title>")
if title then
node.t=(node.seq and (node.seq.." ") or "")..title
end
end
end
local t=node.t and node.t:lower()
if t and p=="part" and (not node.c or #node.c==0) and last then
node.c={p=node.p,n=node.n}

for j=last,i+1,-1 do
local child=table.remove(root.docs[1].c,j)
--print(node.t,child.t)
Expand Down Expand Up @@ -555,6 +571,9 @@ function builder:processHTML(file,level)
if not file:lower():find("%.html?$") then return end
local prefix=string.rep("%.%./",level)
local txt=self.read(file)
if not txt then
error('error on opening file: '..file)
end
if self.is_javadoc then
txt=txt:gsub('(<script)(.-)(</script>)',function(a,b,c)
return a..b:gsub('&lt;','>'):gsub('&amp;','&'):gsub('&gt;','<')..c
Expand Down Expand Up @@ -601,7 +620,7 @@ function builder:processHTML(file,level)
txt=txt:gsub('%s*<footer>(.-)</footer>%s*',function(s)
if not s:find("nav%.gif") then return "" end
local left,right,copy='#','#',''
for url,dir in s:gmatch('<a%s+href="([^"]+)"[^>]*><img%s+[^>]+/(%w+)nav.gif"') do
for url,dir in s:gmatch('<a%s+href="([^"]+)"[^>]*><img%s+[^>]+src="(.-/(%w+)nav.gif)"') do
if dir=='left' then
left=url
else
Expand All @@ -611,10 +630,10 @@ function builder:processHTML(file,level)
copy=s:match("(Copyright[^<]+)") or "";
return ([[
<hr/><table><tr>
<td style="width:80px"><a href="%s"><img width="24" height="24" src="../dcommon/gifs/leftnav.gif" alt="Go to previous page" /><br/><span class="icon">Previous</span></a></td>
<td style="text-align:center;vertical-align:middle;font-size:9px"><img width="144" height="18" src="../dcommon/gifs/oracle.gif" alt="Oracle" /><br/>%s</td>
<td style="width:80px"><a href="%s"><img width="24" height="24" src="../dcommon/gifs/rightnav.gif" alt="Go to next page" /><br /><span class="icon">Next</span></a></td>
</tr></table>]]):format(left,copy,right)
<td style="width:80px"><a href="%s"><img width="24" height="24" src="%s/gifs/leftnav.gif" alt="Go to previous page" /><br/><span class="icon">Prev</span></a></td>
<td style="text-align:center;vertical-align:middle;font-size:9px"><img width="144" height="18" src="%s/gifs/oracle.gif" alt="Oracle" /><br/>%s</td>
<td style="width:80px;text-align:right"><a href="%s"><img width="24" height="24" src="%s/gifs/rightnav.gif" alt="Go to next page" /><br /><span class="icon">Next</span></a></td>
</tr></table>]]):format(left,dcommon_path,dcommon_path,copy,right,dcommon_path)
end)
txt=txt:gsub([[(%s*<script.-<%/script>%s*)]],'')
txt=txt:gsub('%s*<a href="#BEGIN".-</a>%s*','')
Expand Down
212 changes: 59 additions & 153 deletions htmlparser.lua
Original file line number Diff line number Diff line change
@@ -1,161 +1,67 @@
-- vim: ft=lua ts=2 sw=2

local esc = function(s) return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") end
local str = tostring
local char = string.char
local err = function(s) io.stderr:write(s) end
local out = function(s) io.stdout:write(s) end

local ElementNode = require("htmlparser.ElementNode")
local voidelements = require("htmlparser.voidelements")

local HtmlParser = {}

local tpr = {
-- Here we're replacing confusing sequences
-- (things looking like tags, but appearing where tags can't)
-- with a definitely invalid UTF sequence, and later we'll replace them back
["<"] = char(208,209,208,209),
[">"] = char(209,208,209,208),
}

local function parse(text,limit)
local text=str(text)

local limit = limit or htmlparser_looplimit or 1000

local tpl = false

local function g(id,...)
local arg={...}
arg[id]=tpr[arg[id]]
tpl=true
return table.concat(arg)
end

text = text
:gsub(
"(<)"..
"([^>]-)"..
"(<)",
function(...)return g(3,...)end
):gsub(
"("..tpr["<"]..")"..
"([^%w%s])"..
"([^%2]-)"..
"(%2)"..
"(>)"..
"([^>]-)"..
"(>)",
function(...)return g(5,...)end
):gsub(
[=[(['"])]=]..
[=[([^'">%s]-)]=]..
"(>)"..
[=[([^'">%s]-)]=]..
[=[(['"])]=],
function(...)return g(3,...)end
)

local index = 0
local root = ElementNode:new(index, str(text))

local node, descend, tpos, opentags = root, true, 1, {}
while true do
if index == limit then
err("[HTMLParser] [ERR] Main loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end

local openstart, name
openstart, tpos, name = root._text:find(
"<" .. -- an uncaptured starting "<"
"([%w-]+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
tpos)

if not name then break end

index = index + 1

local tag = ElementNode:new(index, str(name), node, descend, openstart, tpos)
node = tag

local tagloop
local tagst, apos = tag:gettext(), 1
while true do
if tagloop == limit then
err("[HTMLParser] [ERR] tag parsing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end

local start, k, eq, quote, v
start, apos, k, eq, quote = tagst:find(
"%s+" .. -- some uncaptured space
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
"(=?)" .. -- eq = the optional; "=", else ""
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
apos)

if not k or k == "/>" or k == ">" then break end

if eq == "=" then
pattern = "=([^%s>]*)"
if quote ~= "" then
pattern = quote .. "([^" .. quote .. "]*)" .. quote
end
start, apos, v = tagst:find(pattern, apos)
end

v=v or ""

if tpl then
for rk,rv in pairs(tpr) do
v = v:gsub(rv,rk)
end
end

tag:addattribute(k, v)
tagloop = (tagloop or 0) + 1
end

if voidelements[tag.name:lower()] then
descend = false
tag:close()
else
opentags[tag.name] = opentags[tag.name] or {}
table.insert(opentags[tag.name], tag)
end

local closeend = tpos
local closingloop
while true do
if closingloop == limit then
err("[HTMLParser] [ERR] tag closing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end

local closestart, closing, closename
closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)

if not closing or closing == "" then break end

tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
closestart = root._text:find("<", closestart)
tag:close(closestart, closeend + 1)
node = tag.parent
descend = true
closingloop = (closingloop or 0) + 1
end
end

if tpl then
for k,v in pairs(tpr) do
root._text = root._text:gsub(v,k)
end
end

return root
--- Parse an HTML string into a tree of ElementNode objects.
-- Scans `text` for opening tags, creates one ElementNode per tag, collects
-- the tag's attributes, and then consumes any closing tags that follow so
-- that the matching open nodes get closed.
-- @param text the HTML source string
-- @return the root ElementNode created for the whole text
local function parse(text)
  local index = 0
  local root = ElementNode:new(index, text)

  -- node    : most recently created node (passed as context to the next tag)
  -- descend : flag handed to ElementNode:new; set to false after a void
  --           element and back to true after a closing tag
  -- tpos    : position in root._text where the next tag search starts
  -- opentags: per tag name, a stack of nodes that are still open
  local node, descend, tpos, opentags = root, true, 1, {}
  while true do
    local openstart, name
    openstart, tpos, name = string.find(root._text,
      "<" ..        -- an uncaptured starting "<"
      "([%w-]+)" .. -- name = the first word, directly following the "<"
      "[^>]*>",     -- include, but not capture everything up to the next ">"
      tpos)
    if not name then break end
    index = index + 1
    local tag = ElementNode:new(index, name, node, descend, openstart, tpos)
    node = tag

    -- Attribute scan over the tag's own text (presumably the
    -- "<name ...>" span between openstart and tpos -- confirm in ElementNode).
    local tagst, apos = tag:gettext(), 1
    while true do
      local _, key_end, k, eq, quote = string.find(tagst,
        "%s+" ..         -- some uncaptured space
        "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
        "(=?)" ..        -- eq = the optional "=", else ""
        "(['\"]?)",      -- quote = an optional "'" or '"' following the "=", or ""
        apos)
      if not k or k == "/>" or k == ">" then break end
      apos = key_end

      local v
      if eq == "=" then
        local pattern = "=([^%s>]*)"
        if quote ~= "" then
          pattern = quote .. "([^" .. quote .. "]*)" .. quote
        end
        local _, value_end, value = string.find(tagst, pattern, apos)
        -- Only advance when the value was actually found. With an
        -- unterminated quote (e.g. the tag text is cut at a ">" inside the
        -- quoted value) the search fails; overwriting apos with nil would
        -- restart the scan at position 1 and loop forever on the same
        -- attribute.
        if value_end then
          apos, v = value_end, value
        end
      end
      tag:addattribute(k, v or "")
    end

    if voidelements[string.lower(tag.name)] then
      -- Void elements are closed immediately.
      descend = false
      tag:close()
    else
      opentags[tag.name] = opentags[tag.name] or {}
      table.insert(opentags[tag.name], tag)
    end

    -- Consume every closing tag that appears before the next opening tag.
    local closeend = tpos
    while true do
      local closestart, closing, closename
      closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend)
      if not closing or closing == "" then break end
      tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
      -- The match starts at the text preceding "<"; locate the "<" itself.
      closestart = string.find(root._text, "<", closestart)
      tag:close(closestart, closeend + 1)
      node = tag.parent
      descend = true
    end
  end

  return root
end
HtmlParser.parse = parse

Expand Down
Loading

0 comments on commit 77c5468

Please sign in to comment.