Add more Julia templates

Add templates
svilupp · Dec 13, 2023 · 7004044 · 7004044
2 parents 9f369f1 + 69f5622
commit 7004044
Show file tree

Hide file tree

Showing 7 changed files with 301 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
-- Improved AICode parsing and error handling (eg, capture more REPL prompts, detect parsing errors earlier), including the option to remove unsafe code (eg, `Pkg.add("SomePkg")`) with `AICode(msg; skip_unsafe=true, vebose=true)`
+- Improved AICode parsing and error handling (eg, capture more REPL prompts, detect parsing errors earlier, parse more code fence types), including the option to remove unsafe code (eg, `Pkg.add("SomePkg")`) with `AICode(msg; skip_unsafe=true, verbose=true)`
+- Added new prompt templates: `JuliaRecapTask`, `JuliaRecapCoTTask`, `JuliaExpertTestCode` and updated `JuliaExpertCoTTask` to be more robust against early stopping for smaller OSS models
 
 ### Fixed
 

diff --git a/src/code_generation.jl b/src/code_generation.jl
@@ -149,12 +149,79 @@ function isparsed(cb::AICode)
     return isparsed(cb.expression) && !isparseerror(cb.error)
 end
 
+## Parsing Helpers
+# Expression heads treated as evidence that a parsed snippet is real Julia code.
+# Read by `is_julia_expr` via `Meta.isexpr(ex, JULIA_EXPR_HEADS)`.
+# Declared `const` so that functions reading this global stay type-stable.
+# NOTE(review): `:array` and `:index` do not seem to be heads the Julia parser emits
+# (array literals are `:vect`/`:vcat`/`:hcat`, indexing is `:ref`) -- confirm whether
+# other heads were intended; they are kept here because they are harmless.
+const JULIA_EXPR_HEADS = [
+    :block,
+    :quote,
+    :call,
+    :macrocall,
+    :(=),
+    :function,
+    :for,
+    :if,
+    :while,
+    :let,
+    :try,
+    :catch,
+    :finally,
+    :method,
+    :tuple,
+    :array,
+    :index,
+    :ref,
+    :.,
+    :do,
+    :curly,
+    :typed_vcat,
+    :typed_hcat,
+    :typed_comprehension, # was a duplicated `:typed_vcat` entry
+    :comprehension,
+    :generator,
+    :kw,
+    :where,
+]
+# Naive heuristic: does the expression `ex` carry any hallmark of Julia code?
+# Serves as a quick check to avoid trying to eval output cells (```plaintext ... ```)
+# Anything that is not an `Expr` is never considered Julia code.
+is_julia_expr(ex::Any) = false
+function is_julia_expr(ex::Expr)
+    ## A known head on the expression itself, or on any of its direct arguments, is enough
+    has_known_head(x) = Meta.isexpr(x, JULIA_EXPR_HEADS)
+    return has_known_head(ex) || any(has_known_head, ex.args)
+end
+
+## Heuristic check whether the String `code` looks like a valid Julia expression.
+## Returns `true` only if `code` parses (`isparsed`) and shows hallmarks of Julia (`is_julia_expr`)
+function is_julia_code(code::AbstractString)
+    # Parse the whole snippet; any exception raised while parsing means "not Julia"
+    parsed = try
+        Meta.parseall(code)
+    catch
+        nothing
+    end
+    isnothing(parsed) && return false
+
+    return isparsed(parsed) && is_julia_expr(parsed)
+end
+
 ## Overload for AIMessage - simply extracts the code blocks and concatenates them
 function AICode(msg::AIMessage;
         verbose::Bool = false,
         skip_unsafe::Bool = false,
         kwargs...)
-    code = extract_code_blocks(msg.content) |> Base.Fix2(join, "\n")
+    ## Primary path: code blocks found by `extract_code_blocks`
+    code = extract_code_blocks(msg.content)
+    if isempty(code)
+        ## Fallback option for generic code fence, we must check if the content is parseable
+        code = extract_code_blocks_fallback(msg.content) |>
+               x -> filter(is_julia_code, x)
+    end
+    ## Concatenate all retained blocks into one newline-separated string
+    code = join(code, "\n")
+    ## `remove_unsafe_lines` is applied only when `skip_unsafe` is set; remaining kwargs are forwarded
     skip_unsafe && (code = remove_unsafe_lines(code; verbose))
     return AICode(code; kwargs...)
 end
@@ -176,8 +243,10 @@ function extract_julia_imports(input::AbstractString)
             subparts = map(x -> contains(x, ':') ? split(x, ':')[1] : x,
                 split(subparts, ","))
             subparts = replace(join(subparts, ' '), ',' => ' ')
-            packages = filter(!isempty, split(subparts, " ")) .|> Symbol
-            append!(package_names, packages)
+            packages = filter(x -> !isempty(x) && !startswith(x, "Base") &&
+                                       !startswith(x, "Main"),
+                split(subparts, " "))
+            append!(package_names, Symbol.(packages))
         end
     end
     return package_names
@@ -303,6 +372,8 @@ The extracted code blocks are returned as a vector of strings, with each string
 
 Note: Only the content within the code fences is extracted, and the code fences themselves are not included in the output.
 
+See also: `extract_code_blocks_fallback`
+
 # Arguments
 - `markdown_content::String`: A string containing the markdown content from which Julia code blocks are to be extracted.
 
@@ -379,6 +450,60 @@ function extract_code_blocks(markdown_content::T) where {T <: AbstractString}
     return reverse(code_blocks) # Reverse to maintain original order
 end
 
+"""
+    extract_code_blocks_fallback(markdown_content::AbstractString, delim::AbstractString="```")
+
+Extract Julia code blocks from a markdown string using a fallback method (splitting by an arbitrary delimiter `delim`).
+Much more simplistic than `extract_code_blocks` and does not support nested code blocks.
+
+It is often used as a fallback for smaller LLMs that forget to code fence ```julia ... ```.
+
+The text between each delimiter position and the next one is returned (stripped and passed through
+`remove_julia_prompt`), so callers should filter the results, eg, with `is_julia_code`.
+
+# Arguments
+- `markdown_content`: The text to search for delimited blocks.
+- `delim`: The delimiter that separates the blocks. Defaults to triple backticks.
+
+# Returns
+- A vector of the extracted blocks in order of appearance. Its element type is the type of a view
+  into `markdown_content`. The vector is empty if `delim` is not found.
+
+# Example
+
+```julia
+code = \"\"\"
+\`\`\`
+println("hello")
+\`\`\`
+
+Some text
+
+\`\`\`
+println("world")
+\`\`\`
+\"\"\"
+
+# We extract text between triple backticks and check each blob if it looks like a valid Julia code
+code_parsed = extract_code_blocks_fallback(code) |> x -> filter(is_julia_code, x) |> x -> join(x, "\n")
+```
+"""
+function extract_code_blocks_fallback(markdown_content::T,
+        delim::AbstractString = "```") where {T <: AbstractString}
+    # Convert content and delimiters to codeunits
+    content_units = codeunits(markdown_content)
+    delim_units = codeunits(delim)
+    # NOTE(review): presumably returns the start offsets of each `delim` match in ascending
+    # order -- confirm against `find_subsequence_positions`
+    delim_positions = find_subsequence_positions(delim_units, content_units)
+
+    # Extract code blocks
+    eltype_ = typeof(@view(markdown_content[begin:end]))
+    code_blocks = Vector{eltype_}()
+    isempty(delim_positions) && return code_blocks
+
+    # Run the extraction
+    delim_len = length(delim_units)
+    start_pos = delim_positions[1]
+    for end_pos in delim_positions
+        if end_pos > start_pos
+            # Positions are code-unit offsets, so step back to the previous *character*:
+            # `end_pos - 1` is not a valid string index when a multi-byte character
+            # directly precedes the delimiter (indexing would throw `StringIndexError`)
+            block_start = start_pos + delim_len
+            block_end = prevind(markdown_content, end_pos)
+            code_block = markdown_content[block_start:block_end]
+            # Also remove the julia prompt
+            push!(code_blocks, remove_julia_prompt(strip(code_block)))
+            # Reset the start
+            start_pos = end_pos
+        end
+    end
+
+    return code_blocks
+end
+
 """
     extract_function_name(code_block::String) -> Union{String, Nothing}
 

diff --git a/templates/persona-task/JuliaExpertCoTTask.json b/templates/persona-task/JuliaExpertCoTTask.json
@@ -2,12 +2,12 @@
     {
         "content": "Template Metadata",
         "description": "For small code task in Julia language. It will first describe the approach (CoT = Chain of Thought). Placeholders: `task`, `data`",
-        "version": "1",
+        "version": "2.0",
         "source": "",
         "_type": "metadatamessage"
     },
     {
-        "content": "You are a world-class Julia language programmer with the knowledge of the latest syntax. Your communication is brief and concise. You precisely follow the given task and use the data when provided. When no data is provided, create some examples. First, think through your approach step by step. Then implement the solution.",
+        "content": "You are a world-class Julia language programmer and very systematic in your approach to solving problems. \nYou follow the below approach when writing code. Your communication is brief and concise.\n\nProblem Solving Steps:\n- Think through your approach step by step\n- Write any functions and other code you need\n- Solve the task\n- Check that your solution is correct\n\nYou precisely follow the given Task and use the Data when provided. When Data is not provided, create some examples.\n",
         "variables": [],
         "_type": "systemmessage"
     },

diff --git a/templates/persona-task/JuliaExpertTestCode.json b/templates/persona-task/JuliaExpertTestCode.json
@@ -0,0 +1,22 @@
+[
+    {
+        "content": "Template Metadata",
+        "description": "For writing Julia-style unit tests. It expects `code` provided as a string (it can be the whole source code of your app). Instructions are a good way to guide the model which functions to test and how. If you don't need the instructions, set `instructions=\"None.\"`. Placeholders: {{code}}, {{instructions}}",
+        "version": "1",
+        "source": "",
+        "_type": "metadatamessage"
+    },
+    {
+        "content": "You are a world-class Julia language programmer and expert in writing unit and integration tests for Julia applications.\n\nYour task is to write tests for the User's code (or a subset of it).\n\nGeneral Guidelines:\n- Your tests must be as compact as possible while comprehensively covering the functionality of the code\n- Testsets are named after the function\n- Include a brief comment explaining the purpose of each test\n- Write multiple test cases using `@test` to validate different aspects of each function under test. Think about all pathways through the code and test each one.\n\nIf the user provides any Special Instructions, prioritize them over the General Guidelines.\n\n\nExample:\n\"\"\"\n**User's code:**\n\n```julia\nmyadd(a, b) = a + b\n```\n\n**Response:**\n\n```julia\nusing Test\n\n@testset \"myadd\" begin\n    \n    # <any setup code and shared inputs go here>\n\n    # Test for correct addition of positive numbers\n    @test myadd(2, 3) == 5\n\n    # Test for correct addition with a negative number\n    @test myadd(-1, 3) == 2\n\n    # Test for correct addition with zero\n    @test myadd(0, 0) == 0\n\n    # Test for correct addition of large numbers\n    @test myadd(1000, 2000) == 3000\nend\n```\n\"\"\"\n",
+        "variables": [],
+        "_type": "systemmessage"
+    },
+    {
+        "content": "# User's Code\n\n{{code}}\n\n\n# Special Instructions\n\n{{instructions}}\n",
+        "variables": [
+            "code",
+            "instructions"
+        ],
+        "_type": "usermessage"
+    }
+]
diff --git a/templates/persona-task/JuliaRecapCoTTask.json b/templates/persona-task/JuliaRecapCoTTask.json
@@ -0,0 +1,22 @@
+[
+    {
+        "content": "Template Metadata",
+        "description": "Not all models know Julia syntax well. This template carries an extensive summary of key information about Julia and its syntax. It will first describe the approach (CoT = Chain of Thought). Placeholders: `task`, `instructions`",
+        "version": "1.0",
+        "source": "",
+        "_type": "metadatamessage"
+    },
+    {
+        "content": "You are a world-class Julia language programmer and have a very systematic approach to solving problems.\n\nProblem Solving Steps:\n- Recall Julia snippets that will be useful for this Task\n- Think through your approach step by step\n- Solve the Task\n- Double-check that the solution is correct\n\nReminder on Julia Language:\n- Key Syntax: variables `x = 10`, control structures `if-elseif-else`, `isX ? X : Y`, `for`, `while`; functions `function f(x) end`, anonymous `x -> x^2`, arrays `[1, 2, 3]`, slicing `a[1:2]`, tuples `(1, 2)`, namedtuples `(; name=\"Julia\", )`, dictionary `Dict(\"key\" => value)`, `$` for string interpolation. \n- Prefer Julia standard libraries, avoid new packages unless explicitly requested. \n- Use general type annotations like `Number` or `AbstractString` to not be too restrictive. Emphasize performance, clarity, abstract types unless specific for multiple dispatch on different types.\n- Reserved names: `begin`, `end`, `function`. \n- Distinguished from Python with 1-based indexing, multiple dispatch\n\nIf the user provides any Special Instructions, prioritize them over the above guidelines.\n  ",
+        "variables": [],
+        "_type": "systemmessage"
+    },
+    {
+        "content": "# Task\n\n{{task}}\n\n\n\n# Special Instructions\n\n{{instructions}}\n",
+        "variables": [
+            "task",
+            "instructions"
+        ],
+        "_type": "usermessage"
+    }
+]
diff --git a/templates/persona-task/JuliaRecapTask.json b/templates/persona-task/JuliaRecapTask.json
@@ -0,0 +1,22 @@
+[
+    {
+        "content": "Template Metadata",
+        "description": "Not all models know Julia syntax well. This template carries a small summary of key information about Julia and its syntax and it will always first recall the Julia facts. If you don't need any instructions, set `instructions=\"None.\"`. Placeholders: `task`, `instructions`",
+        "version": "1.0",
+        "source": "",
+        "_type": "metadatamessage"
+    },
+    {
+        "content": "You are a world-class Julia language programmer and have a very systematic approach to solving problems.\n\nProblem Solving Steps:\n- Recall Julia snippets that will be useful for this Task\n- Solve the Task\n- Double-check that the solution is correct\n\nReminder on Julia Language:\n- Key Syntax: variables `x = 10`, control structures `if-elseif-else`, `isX ? X : Y`, `for`, `while`; functions `function f(x) end`, anonymous `x -> x^2`, arrays `[1, 2, 3]`, slicing `a[1:2]`, tuples `(1, 2)`, namedtuples `(; name=\"Julia\", )`, dictionary `Dict(\"key\" => value)`, `$` for string interpolation. \n- Prefer Julia standard libraries, avoid new packages unless explicitly requested. \n- Use general type annotations like `Number` or `AbstractString` to not be too restrictive. Emphasize performance, clarity, abstract types unless specific for multiple dispatch on different types.\n- Reserved names: `begin`, `end`, `function`. \n- Distinguished from Python with 1-based indexing, multiple dispatch\n\nIf the user provides any Special Instructions, prioritize them over the above guidelines.\n  ",
+        "variables": [],
+        "_type": "systemmessage"
+    },
+    {
+        "content": "# Task\n\n{{task}}\n\n\n\n# Special Instructions\n\n{{instructions}}\n",
+        "variables": [
+            "task",
+            "instructions"
+        ],
+        "_type": "usermessage"
+    }
+]