Skip to content

Commit

Permalink
update fallback parser to expect newlines
Browse files Browse the repository at this point in the history
  • Loading branch information
svilupp committed Dec 16, 2023
1 parent a44df44 commit 1d944f6
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 15 deletions.
53 changes: 40 additions & 13 deletions src/code_generation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -430,13 +430,22 @@ extract_code_blocks(markdown_multiple)
function extract_code_blocks(markdown_content::T) where {T <: AbstractString}
# Convert content and delimiters to codeunits
content_units = codeunits(markdown_content)
start_delim_units = codeunits("```julia")
end_delim_units = codeunits("```\n")
# Ideal code fences
start_delim_units1 = codeunits("\n```julia\n")
end_delim_units1 = codeunits("\n```\n")
# Fallback code fences
start_delim_units2 = codeunits("```julia\n")
end_delim_units2 = codeunits("\n```")

# Find all starting and ending positions of code blocks
start_positions = find_subsequence_positions(start_delim_units, content_units)
end_positions = setdiff(find_subsequence_positions(end_delim_units, content_units),
start_positions)
pos = find_subsequence_positions(start_delim_units1, content_units)
pos2 = find_subsequence_positions(start_delim_units2, content_units)
# the +1 offset is because the first pattern starts 1 character earlier
start_positions = vcat(pos2, pos .+ 1) |> unique

pos = find_subsequence_positions(end_delim_units1, content_units)
pos2 = find_subsequence_positions(end_delim_units2, content_units)
end_positions = vcat(pos, pos2) |> unique
unused_end_positions = trues(length(end_positions))

# Generate code block position pairs
Expand All @@ -461,7 +470,9 @@ function extract_code_blocks(markdown_content::T) where {T <: AbstractString}
eltype_ = typeof(@view(markdown_content[begin:end]))
code_blocks = Vector{eltype_}()
for (start_pos, end_pos) in filtered_positions
code_block = markdown_content[(start_pos + length(start_delim_units)):(end_pos - 1)]
start_ = (start_pos + length(start_delim_units2))
end_ = prevind(markdown_content, end_pos)
code_block = markdown_content[start_:end_]
# Also remove the julia prompt
push!(code_blocks, remove_julia_prompt(strip(code_block)))
end
Expand All @@ -470,7 +481,7 @@ function extract_code_blocks(markdown_content::T) where {T <: AbstractString}
end

"""
extract_code_blocks_fallback(markdown_content::String, delim::AbstractString="```")
extract_code_blocks_fallback(markdown_content::String, delim::AbstractString="\\n```\\n")
Extract Julia code blocks from a markdown string using a fallback method (splitting by arbitrary `delim`-iters).
Much more simplistic than `extract_code_blocks` and does not support nested code blocks.
Expand All @@ -497,22 +508,38 @@ code_parsed = extract_code_blocks_fallback(code) |> x -> filter(is_julia_code, x
```
"""
function extract_code_blocks_fallback(markdown_content::T,
delim::AbstractString = "```") where {T <: AbstractString}
delim::AbstractString = "\n```\n") where {T <: AbstractString}
# Convert content and delimiters to codeunits
content_units = codeunits(markdown_content)
content_length = length(content_units)
delim_units = codeunits(delim)
delim_positions = find_subsequence_positions(delim_units, content_units)

# Extract code blocks
eltype_ = typeof(@view(markdown_content[begin:end]))
code_blocks = Vector{eltype_}()
isempty(delim_positions) && return code_blocks
isempty(delim_positions) && !startswith(markdown_content, lstrip(delim)) &&
return code_blocks

# Run the extraction
start_pos = delim_positions[1]
for end_pos in delim_positions
if end_pos > start_pos
code_block = markdown_content[(start_pos + length(delim_units)):(end_pos - 1)]
# catch if we're missing the opening mark because of document start
no_newline_start = lstrip(delim)
start_pos = if no_newline_start != delim &&
startswith(markdown_content, no_newline_start)
(length(codeunits(no_newline_start)) - length(delim_units))
else
delim_positions[1]
end
no_new_line_end = rstrip(delim)
if no_new_line_end != delim && endswith(markdown_content, no_new_line_end)
last_end = 1 + content_length - length(codeunits(no_new_line_end))
push!(delim_positions, last_end)
end
# start the iteration
for end_pos in unique(delim_positions)
if end_pos > start_pos && end_pos <= content_length
end_ = prevind(markdown_content, end_pos)
code_block = markdown_content[(start_pos + length(delim_units)):end_]
# Also remove the julia prompt
push!(code_blocks, remove_julia_prompt(strip(code_block)))
# Reset the start
Expand Down
11 changes: 9 additions & 2 deletions test/code_generation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,12 @@ end
@test extract_code_blocks(markdown_content) ==
SubString{String}["println(\"Hello, World!\")"]

# at edges (no newlines)
markdown_content = """```julia
println("hello")
```"""
@test extract_code_blocks(markdown_content) ==
SubString{String}["println(\"hello\")"]
# Multiple Julia Code Blocks
markdown_content = """
```julia
Expand Down Expand Up @@ -270,13 +276,14 @@ end
@test isempty(extract_code_blocks_fallback("Some text without code blocks"))

# Adjacent Code Blocks Test
@test extract_code_blocks_fallback("```code1``` ```code2```") == ["code1", "", "code2"]
@test extract_code_blocks_fallback("```\ncode1\n```\n \n```\ncode2\n```") ==
["code1", "", "code2"]

# Special Characters Test
@test extract_code_blocks_fallback("```\n<>&\"'\n```") == ["<>&\"'"]

# Large Input Test
large_input = "```" * repeat("large code block\n", 10) * "```"
large_input = "```\n" * repeat("large code block\n", 10) * "```"
@test extract_code_blocks_fallback(large_input) ==
[strip(repeat("large code block\n", 10))]

Expand Down

0 comments on commit 1d944f6

Please sign in to comment.