diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4b77896
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,24 @@
+JL = julia --project
+
+default: init test
+
+init:
+	$(JL) -e 'using Pkg; Pkg.precompile(); Pkg.activate("docs"); Pkg.develop(path="."); Pkg.precompile()'
+
+update:
+ $(JL) -e 'using Pkg; Pkg.update(); Pkg.precompile(); Pkg.activate("docs"); Pkg.update(); Pkg.precompile()'
+
+test:
+ $(JL) -e 'using Pkg; Pkg.test()'
+
+coverage:
+ $(JL) -e 'using Pkg; Pkg.test(; coverage=true)'
+
+serve:
+ $(JL) -e 'using Pkg; Pkg.activate("docs"); using LiveServer; servedocs(;skip_dirs=["docs/src/assets", "docs/src/generated"])'
+
+clean:
+ rm -rf docs/build
+ find . -name "*.cov" -type f -print0 | xargs -0 /bin/rm -f
+
+.PHONY: init test coverage serve clean update
\ No newline at end of file
diff --git a/README.md b/README.md
index aff2941..e2df687 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Einstein summation can be implemented in no more than 20 lines of Julia code, th
*Note: why the test coverage is not 100%* - GPU-code coverage is not evaluated although we test the GPU code properly on gitlab. Ignoring the GPU-code, the actual coverage is at about _97%_.
-*Warning: since v0.4, OMEinsum does not optimize the contraction order anymore. One has to use nested einsum to specify the contraction order manually, e.g. `ein"(ijk,jkl),klm->im"(x, y, z)`.*
+*Warning: since v0.4, OMEinsum does not optimize the contraction order anymore. One has to use nested einsum to specify the contraction order manually, e.g. `ein"(ijk,jkl),klm->im"(x, y, z)`.* Please check out the [documentation](https://under-Peter.github.io/OMEinsum.jl/dev/contractionorder/) for more details.
## Install
@@ -89,32 +89,7 @@ which is closer to the standard way of writing einsum-operations in physics
julia> @ein c[i,j] := a[i,k] * b[k,j];
```
-#### A table for reference
-| code | meaning |
-| ---------------- | --------------- |
-| `ein"ij,jk->ik"` | matrix matrix multiplication |
-| `ein"ijl,jkl->ikl"` | batched - matrix matrix multiplication |
-| `ein"ij,j->i"` | matrix vector multiplication |
-| `ein"ij,ik,il->jkl"` | star contraction |
-| `ein"ii->"` | trace |
-| `ein"ij->i"` | sum |
-| `ein"ii->i"` | take the diagonal part of a matrix |
-| `ein"ijkl->ilkj"` | permute the dimensions of a tensor |
-| `ein"i->ii"` | construct a diagonal matrix |
-| `ein"->ii"` | broadcast a scalar to the diagonal part of a matrix |
-| `ein"ij,ij->ij"` | element wise product |
-| `ein"ij,kl->ijkl"` | outer product |
-
-
-Many of these are handled by special kernels
-([listed in the docs](https://under-peter.github.io/OMEinsum.jl/stable/implementation/)),
-but there is also a fallback which handles other cases
-(more like what [Einsum.jl](https://github.com/ahwillia/Einsum.jl) does, plus a GPU version).
-
-It is sometimes helpful to specify the order of operations, by inserting brackets,
-either because you know this will be more efficient,
-or to help the computer see what kernels can be used.
-For example:
+It is sometimes helpful to specify the order of operations by inserting brackets, either because you know this will be more efficient, or to help the computer see which kernels can be used. For example:
```julia
julia> @ein Z[o,s] := x[i,s] * (W[o,i,j] * y[j,s]); # macro style
@@ -140,107 +115,13 @@ julia> Zl = ein"is, oij, js -> os"(x, W, y);
└ @ OMEinsum ~/.julia/dev/OMEinsum/src/loop_einsum.jl:26
```
-To see more examples using the GPU and autodiff, check out our asciinema-demo here:
-[![asciicast](https://asciinema.org/a/wE4CtIzWUC3R0GkVV28rVBRFb.svg)](https://asciinema.org/a/wE4CtIzWUC3R0GkVV28rVBRFb)
-
-## Application
-
-For an application in tensor network algorithms, check out the [TensorNetworkAD](https://github.com/under-Peter/TensorNetworkAD.jl)
-package, where `OMEinsum` is used to evaluate tensor-contractions, permutations and summations.
-
-#### Toy Application: solving a 3-coloring problem on the Petersen graph
-Let us focus on graphs
-with vertices with three edges each. A question one might ask is:
-How many different ways are there to colour the edges of the graph with
-three different colours such that no vertex has a duplicate colour on its edges?
-
-The counting problem can be transformed into a contraction of rank-3 tensors
-representing the edges. Consider the tensor `s` defined as
-```julia
-julia> s = map(x->Int(length(unique(x.I)) == 3), CartesianIndices((3,3,3)))
-```
-
-Then we can simply contract `s` tensors to get the number of 3 colourings satisfying the above condition!
-E.g. for two vertices, we get 6 distinct colourings:
-```julia
-julia> ein"ijk,ijk->"(s,s)[]
-6
-```
-
-Using that method, it's easy to find that e.g. the peterson graph allows no 3 colouring, since
-```julia
-julia> code = ein"afl,bhn,cjf,dlh,enj,ago,big,cki,dmk,eom->"
-afl, bhn, cjf, dlh, enj, ago, big, cki, dmk, eom
-
-julia> code(fill(s, 10)...)[]
-0
-```
-
-The peterson graph consists of 10 vertices and 15 edges and looks like a pentagram
-embedded in a pentagon as depicted here:
-
-![](https://upload.wikimedia.org/wikipedia/commons/thumb/f/f5/Petersen_graph.svg/252px-Petersen_graph.svg.png)
-
-`OMEinsum` does not optimie the contraction order by default, so the above contraction can be time consuming. To speed up the contraction, we can use `optimize_code` to optimize the contraction order:
-```julia
-julia> optcode = optimize_code(code, uniformsize(code, 3), TreeSA())
-SlicedEinsum{Char, DynamicNestedEinsum{Char}}(Char[], ago, goa ->
-├─ ago
-└─ gcojl, cjal -> goa
- ├─ bgck, bojlk -> gcojl
- │ ├─ big, cki -> bgck
- │ │ ├─ big
- │ │ └─ cki
- │ └─ bhomj, lhmk -> bojlk
- │ ├─ bhn, omnj -> bhomj
- │ │ ├─ bhn
- │ │ └─ eom, enj -> omnj
- │ │ ⋮
- │ │
- │ └─ dlh, dmk -> lhmk
- │ ├─ dlh
- │ └─ dmk
- └─ cjf, afl -> cjal
- ├─ cjf
- └─ afl
-)
-
-julia> contraction_complexity(optcode, uniformsize(optcode, 3))
-Time complexity: 2^12.737881076857779
-Space complexity: 2^7.92481250360578
-Read-write complexity: 2^11.247334178028728
-
-julia> optcode(fill(s, 10)...)[]
-0
-```
-We can see the time complexity of the optimized code is much smaller than the original one. To know more about the contraction order optimization, please check the julia package [`OMEinsumContractionOrders.jl`](https://github.com/TensorBFS/OMEinsumContractionOrders.jl).
-
-Confronted with the above result, we can ask whether the peterson graph allows a relaxed variation of 3 colouring, having one vertex that might accept duplicate colours. The answer to that can be found using the gradient w.r.t a vertex:
-```julia
-julia> using Zygote: gradient
-
-julia> gradient(x->optcode(x,s,s,s,s,s,s,s,s,s)[], s)[1] |> sum
-0
-```
-This tells us that even if we allow duplicates on one vertex, there are no 3-colourings for the peterson graph.
-
## Comparison with other packages
Similar packages include:
- [TensorOperations.jl](https://github.com/Jutho/TensorOperations.jl) and [TensorKit.jl](https://github.com/Jutho/TensorKit.jl)
- [ITensors.jl](https://github.com/ITensor/ITensors.jl)
-Comparing with the above packages, `OMEinsum` is optimized over large scale tensor network (or einsum, sum-product network) contraction. Its main advantages are:
-- `OMEinsum` has better support to very high dimensional tensor networks and their contraction order.
-- `OMEinsum` allows an index to appear multiple times.
-- `OMEinsum` has well tested generic element type support.
-
-However, `OMEinsum` also has some disadvantages:
-- `OMEinsum` does not support good quantum numbers.
-- `OMEinsum` has less optimization on small scale problems.
+Compared with the above packages, `OMEinsum` is optimized for large-scale tensor network (or einsum, sum-product network) contraction.
## Contribute
-Suggestions and Comments in the _Issues_ are welcome.
-
-## License
-MIT License
+Suggestions and comments in the [_Issues_](https://github.com/under-Peter/OMEinsum.jl/issues) are welcome.
\ No newline at end of file
diff --git a/docs/Project.toml b/docs/Project.toml
index 8f7ee43..3964eda 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,5 +1,6 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589"
OMEinsum = "ebe7aa44-baf0-506c-a96f-8464559b3922"
OMEinsumContractionOrders = "6f22d1fd-8eed-4bb7-9776-e7d684900715"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
diff --git a/docs/make.jl b/docs/make.jl
index ba56981..7769e8c 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -5,11 +5,13 @@ makedocs(;
format=Documenter.HTML(),
pages=[
"Home" => "index.md",
- "Parsing" => "parsing.md",
- "Implementations" => "implementation.md",
+ "Background: Tensor Networks" => "background.md",
+ "Basic usage" => "basic.md",
"Contraction order optimization" => "contractionorder.md",
- "Extending OMEinsum" => "extending.md",
- "DocStrings" => "docstrings.md"
+ "Automatic differentiation" => "autodiff.md",
+ "CUDA" => "cuda.md",
+ "Applications" => "applications.md",
+ "Manual" => "docstrings.md"
],
repo="https://github.com/under-Peter/OMEinsum.jl/blob/{commit}{path}#L{line}",
sitename="OMEinsum.jl",
diff --git a/docs/src/applications.md b/docs/src/applications.md
new file mode 100644
index 0000000..c8a0f59
--- /dev/null
+++ b/docs/src/applications.md
@@ -0,0 +1,55 @@
+# Application
+
+## List of packages using OMEinsum
+- [GenericTensorNetworks](https://github.com/QuEraComputing/GenericTensorNetworks.jl), solving combinatorial optimization problems with generic tensor networks.
+- [TensorInference](https://github.com/TensorBFS/TensorInference.jl), probabilistic inference by contracting tensor networks.
+- [YaoToEinsum](https://github.com/QuantumBFS/Yao.jl), the tensor network simulation backend for quantum circuits.
+- [TensorNetworkAD2](https://github.com/YidaiZhang/TensorNetworkAD2.jl), using differentiable programming tensor networks to solve quantum many-body problems.
+- [TensorQEC](https://github.com/nzy1997/TensorQEC.jl), tensor networks for quantum error correction.
+
+## Example: Solving a 3-coloring problem on the Petersen graph
+Let us focus on graphs whose vertices each have three edges. A question one might ask is:
+how many different ways are there to colour the edges of the graph with
+three different colours such that no vertex has a duplicate colour on its edges?
+
+The counting problem can be transformed into a contraction of rank-3 tensors,
+one per vertex, whose indices correspond to the edges. Consider the tensor `s` defined as
+```@repl coloring
+using OMEinsum
+s = map(x->Int(length(unique(x.I)) == 3), CartesianIndices((3,3,3)))
+```
+
+Then we can simply contract `s` tensors to get the number of 3-colourings satisfying the above condition!
+E.g. for two vertices connected by three edges, we get 6 distinct colourings:
+```@repl coloring
+ein"ijk,ijk->"(s,s)[]
+```
+
+Using this method, it is easy to find that, e.g., the Petersen graph allows no 3-colouring, since
+```@repl coloring
+code = ein"afl,bhn,cjf,dlh,enj,ago,big,cki,dmk,eom->"
+afl, bhn, cjf, dlh, enj, ago, big, cki, dmk, eom
+code(fill(s, 10)...)[]
+```
+
+The Petersen graph consists of 10 vertices and 15 edges and looks like a pentagram
+embedded in a pentagon, as depicted here:
+
+![](https://upload.wikimedia.org/wikipedia/commons/thumb/f/f5/Petersen_graph.svg/252px-Petersen_graph.svg.png)
+
+`OMEinsum` does not optimize the contraction order by default, so the above contraction can be time consuming. To speed up the contraction, we can use `optimize_code` to optimize the contraction order:
+```@repl coloring
+optcode = optimize_code(code, uniformsize(code, 3), TreeSA())
+contraction_complexity(optcode, uniformsize(optcode, 3))
+optcode(fill(s, 10)...)[]
+```
+We can see that the time complexity of the optimized code is much smaller than that of the original one. To learn more about contraction order optimization, please check the Julia package [`OMEinsumContractionOrders.jl`](https://github.com/TensorBFS/OMEinsumContractionOrders.jl).
+
+Confronted with the above result, we can ask whether the Petersen graph allows a relaxed variation of 3-colouring, where one vertex may have duplicate colours. The answer can be found using the gradient w.r.t. a vertex tensor:
+```@repl coloring
+using Zygote: gradient
+gradient(x->optcode(x,s,s,s,s,s,s,s,s,s)[], s)[1] |> sum
+```
+This tells us that even if we allow duplicates on one vertex, there are no 3-colourings of the Petersen graph.
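+
+The built-in [`cost_and_gradient`](@ref) function can return the gradients with respect to all ten vertex tensors in one call. The snippet below is only a sketch; it assumes `cost_and_gradient` accepts the optimized code and the integer-valued tensors used above (see the Automatic differentiation page):
+```julia
+# a sketch: gradients w.r.t. all ten vertex tensors at once
+cost, grads = cost_and_gradient(optcode, (fill(s, 10)...,))
+sum.(grads)  # expected: ten zeros, i.e., no single vertex can be relaxed
+```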
+
diff --git a/docs/src/assets/matmul.png b/docs/src/assets/matmul.png
new file mode 100644
index 0000000..f16d4ec
Binary files /dev/null and b/docs/src/assets/matmul.png differ
diff --git a/docs/src/assets/perm.svg b/docs/src/assets/perm.svg
new file mode 100644
index 0000000..d572238
--- /dev/null
+++ b/docs/src/assets/perm.svg
@@ -0,0 +1,36 @@
+
+
diff --git a/docs/src/assets/starcontract.png b/docs/src/assets/starcontract.png
new file mode 100644
index 0000000..43e5a8c
Binary files /dev/null and b/docs/src/assets/starcontract.png differ
diff --git a/docs/src/assets/tensors.svg b/docs/src/assets/tensors.svg
new file mode 100644
index 0000000..a31dee4
--- /dev/null
+++ b/docs/src/assets/tensors.svg
@@ -0,0 +1,112 @@
+
+
diff --git a/docs/src/autodiff.md b/docs/src/autodiff.md
new file mode 100644
index 0000000..53b3473
--- /dev/null
+++ b/docs/src/autodiff.md
@@ -0,0 +1,22 @@
+# Automatic differentiation
+
+There are two ways to compute the gradient of an einsum expression. The first is to use the function built into `OMEinsum`, a custom implementation of reverse-mode automatic differentiation. The second is to use the [`Zygote`](https://github.com/FluxML/Zygote.jl) package, a source-to-source automatic differentiation tool.
+
+## Built-in automatic differentiation
+The `OMEinsum` package provides a built-in function [`cost_and_gradient`](@ref) to compute the cost and the gradient of an einsum expression.
+
+```@repl autodiff
+using OMEinsum # the 1st way
+A, B, C = randn(2, 3), randn(3, 4), randn(4, 2);
+y, g = cost_and_gradient(ein"(ij, jk), ki->", (A, B, C))
+```
+This built-in automatic differentiation is designed for tensor contractions and is more efficient than the general-purpose automatic differentiation tools.
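+
+As a quick sanity check, one can compare an entry of the returned gradient against a central finite difference. This is only a sketch; it assumes that `g` above is a collection holding one gradient array per input tensor:
+```julia
+# finite-difference check of ∂y/∂A[1,1] (a sketch)
+f(A) = ein"(ij, jk), ki->"(A, B, C)[]
+Δ = zeros(size(A)); Δ[1, 1] = 1e-5
+fd = (f(A + Δ) - f(A - Δ)) / 2e-5
+isapprox(fd, g[1][1, 1]; atol=1e-6)  # expected to be true
+```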
+
+## Using Zygote
+The backward rules for the basic einsum operations are ported to [`ChainRulesCore`](https://github.com/JuliaDiff/ChainRulesCore.jl), which is used by the `Zygote` package.
+Zygote is a source-to-source automatic differentiation tool that can compute the gradient of an einsum expression.
+It is more general than the built-in approach and can be applied to any Julia code.
+```@repl autodiff
+using Zygote # the 2nd way
+Zygote.gradient((A, B, C)->ein"(ij, jk), ki->"(A, B, C)[], A, B, C)
+```
\ No newline at end of file
diff --git a/docs/src/background.md b/docs/src/background.md
new file mode 100644
index 0000000..597558e
--- /dev/null
+++ b/docs/src/background.md
@@ -0,0 +1,65 @@
+# Background Knowledge
+
+## Tensors and Tensor Networks
+Tensor networks serve as a fundamental tool for modeling and analyzing correlated systems. This section reviews the basic concepts of tensor
+networks.
+
+A tensor is a mathematical object that generalizes scalars, vectors, and matrices. It can have multiple dimensions and is used to represent data in various mathematical and physical contexts. It is formally defined as follows:
+
+*Definition* (Tensor): A tensor $T$ associated to a set of discrete variables $V$ is defined as a function that maps each possible instantiation of the variables in its scope $\mathcal{D}_V = \prod_{v\in V} \mathcal{D}_{v}$ to an element in the set $\mathcal{E}$, given by
+```math
+T_{V}: \prod_{v \in V} \mathcal{D}_{v} \rightarrow \mathcal{E}.
+```
+Within the context of probabilistic modeling, the elements in $\mathcal{E}$ are non-negative real numbers, while in other scenarios, they can be of generic types. The diagrammatic representation of a tensor is given by a node with the variables $V$ as labels on its edges, as shown below:
+
+```@raw html
+
+```
+
+*Definition* (Tensor Network): A tensor network is a mathematical framework for defining multilinear maps, which can be represented by a triple $\mathcal{N} = (\Lambda, \mathcal{T}, V_0)$, where:
+* $\Lambda$ is the set of variables present in the network $\mathcal{N}$.
+* $\mathcal{T} = \{ T_{V_k} \}_{k=1}^{K}$ is the set of input tensors, where each tensor $T_{V_k}$ is associated with the labels $V_k$.
+* $V_0$ specifies the labels of the output tensor.
+
+Specifically, each tensor $T_{V_k} \in \mathcal{T}$ is labeled by a set of variables $V_k \subseteq \Lambda$, where the cardinality $|V_k|$ equals the rank of $T_{V_k}$. The multilinear map, or the **contraction**, applied to this triple is defined as
+```math
+T_{V_0} = \texttt{contract}(\Lambda, \mathcal{T}, V_0) \overset{\mathrm{def}}{=} \sum_{m \in \mathcal{D}_{\Lambda\setminus V_0}} \prod_{T_V \in \mathcal{T}} T_{V|M=m},
+```
+where $M = \Lambda \setminus V_0$. $T_{V|M=m}$ denotes a slicing of the tensor $T_{V}$ with the variables $M$ fixed to the values $m$. The summation runs over all possible configurations of the variables in $M$.
+
+For instance, matrix multiplication can be described as the contraction of a tensor network given by
+```math
+(AB)_{\{i, k\}} = \texttt{contract}\left(\{i,j,k\}, \{A_{\{i, j\}}, B_{\{j, k\}}\}, \{i, k\}\right),
+```
+where matrices $A$ and $B$ are input tensors containing the variable sets $\{i, j\}, \{j, k\}$, respectively, which are subsets of $\Lambda = \{i, j, k\}$. The output tensor is comprised of variables $\{i, k\}$ and the summation runs over variables $\Lambda \setminus \{i, k\} = \{j\}$. The contraction corresponds to
+```math
+(A B)_{\{i, k\}} = \sum_j A_{\{i,j\}}B_{\{j, k\}}.
+```
+
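+As a minimal sketch connecting this definition to the package documented here, the same contraction can be evaluated with the einsum notation introduced below and compared against ordinary matrix multiplication:
+```julia
+using OMEinsum
+A, B = randn(2, 3), randn(3, 4);
+ein"ij,jk->ik"(A, B) ≈ A * B  # the einsum contraction reproduces matrix multiplication
+```
+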
+Diagrammatically, a tensor network can be represented as an *open hypergraph*, where each tensor is mapped to a vertex and each variable is mapped to a hyperedge. Two vertices are connected by the same hyperedge if and only if they share a common variable. The diagrammatic representation of the matrix multiplication is given as follows:
+
+```@raw html
+
+```
+
+Here, we use different colors to denote different hyperedges. Hyperedges for $i$ and $k$ are left open to denote variables of the output tensor. A slightly more complex example of this is the star contraction:
+```math
+\texttt{contract}(\{i,j,k,l\}, \{A_{\{i, l\}}, B_{\{j, l\}}, C_{\{k, l\}}\}, \{i,j,k\}) \\
+= \sum_{l}A_{\{i,l\}} B_{\{j,l\}} C_{\{k,l\}}.
+```
+Note that the variable $l$ is shared by all three tensors, making regular edges, which by definition connect two nodes, insufficient for its representation. This motivates the need for hyperedges, which can connect a single variable to any number of nodes. The hypergraph representation is given as:
+
+```@raw html
+
+```
+
+## Einsum notation
+The einsum notation is a compact way to specify tensor contractions with a string. In this notation, an index (subscript) is represented by a character, and each tensor is represented by the string of its indices. The input tensors and the output tensor are separated by an arrow `->`, and input tensors are separated by commas `,`. For example, the matrix multiplication $\left(\{i,j,k\}, \{A_{\{i, j\}}, B_{\{j, k\}}\}, \{i, k\}\right)$ can be concisely written as `"ij,jk->ik"`. A general contraction can be defined with pseudocode as follows:
+```
+Let A, B, C, ... be input tensors, O be the output tensor
+for indices in domain_of_unique_indices(einsum_notation)
+ O[indices in O] += A[indices in A] * B[indices in B] * ...
+end
+```
+
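+To make the pseudocode concrete, here is a hand-rolled sketch of the loops for the matrix-multiplication notation `"ij,jk->ik"`. It is for illustration only; OMEinsum dispatches to much faster specialized kernels:
+```julia
+using OMEinsum
+
+# naive realisation of "ij,jk->ik": loop over every assignment of the unique indices i, j, k
+function naive_matmul_einsum(A::AbstractMatrix, B::AbstractMatrix)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch("the shared index j must have equal size"))
+    O = zeros(promote_type(eltype(A), eltype(B)), size(A, 1), size(B, 2))
+    for i in axes(A, 1), j in axes(A, 2), k in axes(B, 2)
+        O[i, k] += A[i, j] * B[j, k]  # O[indices in O] += A[indices in A] * B[indices in B]
+    end
+    return O
+end
+
+A, B = randn(3, 4), randn(4, 5);
+naive_matmul_einsum(A, B) ≈ ein"ij,jk->ik"(A, B)  # true
+```
+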
+Please refer to [Einsum examples](@ref) for some examples.
\ No newline at end of file
diff --git a/docs/src/basic.md b/docs/src/basic.md
new file mode 100644
index 0000000..91bd7af
--- /dev/null
+++ b/docs/src/basic.md
@@ -0,0 +1,65 @@
+# Basic Usage
+
+In the following example, we demonstrate the einsum notation for basic tensor operations.
+
+## Einsum notation
+To specify the operation, the user can either use the [`@ein_str`](@ref)-string literal or the [`EinCode`](@ref) object.
+For example, both of the following definitions specify the matrix multiplication operation:
+```@repl tensor
+using OMEinsum
+code1 = ein"ij,jk -> ik" # the string literal
+ixs = [[1, 2], [2, 3]] # the input indices
+iy = [1, 3] # the output indices
+EinCode(ixs, iy) # the EinCode object (equivalent to the string literal)
+```
+
+The [`@ein_str`](@ref) macro can be used to define the einsum notation directly in the function call.
+```@repl tensor
+A, B = randn(2, 3), randn(3, 4);
+ein"ij,jk -> ik"(A, B) # matrix multiplication
+@ein C[i,k] := A[i,j] * B[j,k] # equivalent to the above line
+```
+Here, the [`@ein`](@ref) macro combines the einsum notation definition and the operation in a single line, which is more convenient for simple operations.
+Separating the notation from the operation (the first approach) is useful for reusing the same einsum notation with different input tensors, as sketched below.
+
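+For example, the `code1` object defined above can be reused directly (a small illustration):
+```@repl tensor
+code1(randn(2, 3), randn(3, 4))  # reuse the same notation for new inputs
+code1(ones(2, 2), ones(2, 2))
+```
+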
+For more than two input tensors, *the [`@ein_str`](@ref) macro does not optimize the contraction order*. In such cases, the user can use the [`@optein_str`](@ref) string literal to optimize the contraction order.
+```@repl tensor
+optein"ij,jk,kl,lm->im"(randn(100, 100), randn(100, 100), randn(100, 100), randn(100, 100))
+```
+
+Sometimes, manually optimizing the contraction order can be beneficial. Please check [Contraction order optimization](@ref) for more details.
+
+## Einsum examples
+We first define the tensors and then demonstrate the einsum notation for various tensor operations.
+```@repl tensor
+using OMEinsum
+s = fill(1) # scalar
+w, v = [1, 2], [4, 5]; # vectors
+A, B = [1 2; 3 4], [5 6; 7 8]; # matrices
+T1, T2 = reshape(1:8, 2, 2, 2), reshape(9:16, 2, 2, 2); # 3D tensors
+```
+### Unary examples
+```@repl tensor
+ein"i->"(w) # sum of the elements of a vector.
+ein"ij->i"(A) # sum of the rows of a matrix.
+ein"ii->"(A) # sum of the diagonal elements of a matrix, i.e., the trace.
+ein"ij->"(A) # sum of the elements of a matrix.
+ein"i->ii"(w) # create a diagonal matrix.
+ein"i->ij"(w; size_info=Dict('j'=>2)) # repeat a vector to form a matrix.
+ein"ijk->ikj"(T1) # permute the dimensions of a tensor.
+```
+
+### Binary examples
+```@repl tensor
+ein"ij, jk -> ik"(A, B) # matrix multiplication.
+ein"ijb,jkb->ikb"(T1, T2) # batch matrix multiplication.
+ein"ij,ij->ij"(A, B) # element-wise multiplication.
+ein"ij,ij->"(A, B) # sum of the element-wise multiplication.
+ein"ij,->ij"(A, s) # element-wise multiplication by a scalar.
+```
+
+### N-ary examples
+```@repl tensor
+optein"ai,aj,ak->ijk"(A, A, B) # star contraction.
+optein"ia,ajb,bkc,cld,dm->ijklm"(A, T1, T2, T1, A) # tensor train contraction.
+```
\ No newline at end of file
diff --git a/docs/src/contractionorder.md b/docs/src/contractionorder.md
index 428fcdb..b035257 100644
--- a/docs/src/contractionorder.md
+++ b/docs/src/contractionorder.md
@@ -1,33 +1,49 @@
# Contraction order optimization
-OMEinsum does not implicitly optimize the contraction order.
-Functionalities related to contraction order optimization are mostly defined in [OMEinsumContractionOrders](https://github.com/TensorBFS/OMEinsumContractionOrders.jl)
+The [`@ein_str`](@ref) string literal does not optimize the contraction order for more than two input tensors.
-Here, we provide an example, advanced uses can be found in [OMEinsumContractionOrders](https://github.com/TensorBFS/OMEinsumContractionOrders.jl) and the [performance tips](https://queracomputing.github.io/GenericTensorNetworks.jl/dev/performancetips/) of [GenericTensorNetworks](https://github.com/QuEraComputing/GenericTensorNetworks.jl).
-Let us first consider the following contraction order
-
-```@example 3
+```@repl order
using OMEinsum
code = ein"ij,jk,kl,li->"
```
The time and space complexity can be obtained by calling the [`contraction_complexity`](@ref) function.
-```@example 3
-size_dict = uniformsize(code, 10)
+```@repl order
+size_dict = uniformsize(code, 10) # set the size of each label to 10
-contraction_complexity(code, size_dict)
+contraction_complexity(code, size_dict) # time and space complexity
```
The return values are `log2` values of the number of iterations, number of elements of the largest tensor and the number of elementwise read-write operations.
-```@example 3
+## Optimizing the contraction order
+To optimize the contraction order, we can use the [`optimize_code`](@ref) function.
+
+```@repl order
optcode = optimize_code(code, size_dict, TreeSA())
```
-The output value is a binary contraction tree with type [`NestedEinsum`](@ref) type.
-The time and readwrite complexities are significantly reduced comparing to the direct contraction.
+The output value is a binary contraction tree of type [`SlicedEinsum`](@ref) or [`NestedEinsum`](@ref).
+`TreeSA` is a local search algorithm that optimizes the contraction order. More algorithms can be found in
+[OMEinsumContractionOrders](https://github.com/TensorBFS/OMEinsumContractionOrders.jl) and in the [performance tips](https://queracomputing.github.io/GenericTensorNetworks.jl/dev/performancetips/) of [GenericTensorNetworks](https://github.com/QuEraComputing/GenericTensorNetworks.jl).
-```@example 3
+After optimizing the contraction order, the time and read-write complexities are significantly reduced.
+
+```@repl order
contraction_complexity(optcode, size_dict)
+```
+
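+Other optimizers re-exported from [OMEinsumContractionOrders](https://github.com/TensorBFS/OMEinsumContractionOrders.jl) can be plugged in the same way. The snippet below is a sketch that assumes the `GreedyMethod` optimizer is available:
+```julia
+# a sketch: swap in a different contraction order optimizer
+greedy_code = optimize_code(code, size_dict, GreedyMethod())
+contraction_complexity(greedy_code, size_dict)
+```
+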
+## Using `optein` string literal
+For convenience, an optimized contraction can be constructed directly using the [`@optein_str`](@ref) string literal.
+```@repl order
+optein"ij,jk,kl,li->" # optimized contraction, without knowing the size of the tensors
+```
+The drawback of using `@optein_str` is that the contraction order is optimized without knowing the sizes of the tensors;
+only the tensor ranks are used to optimize the contraction order.
+
+## Manual optimization
+One can also manually specify the contraction order by using the [`@ein_str`](@ref) string literal.
+```@repl order
+ein"((ij,jk),kl),li->ik" # manually optimized contraction
```
\ No newline at end of file
diff --git a/docs/src/cuda.md b/docs/src/cuda.md
new file mode 100644
index 0000000..ca264d8
--- /dev/null
+++ b/docs/src/cuda.md
@@ -0,0 +1,52 @@
+# CUDA Acceleration
+
+By uploading your data to the GPU, you can significantly accelerate tensor contractions.
+
+```julia-repl
+julia> using CUDA, OMEinsum
+
+julia> code = ein"ij,jk,kl,li->" # the einsum notation
+ij, jk, kl, li ->
+
+julia> A, B, C, D = rand(1000, 1000), rand(1000, 300), rand(300, 800), rand(800, 1000);
+
+julia> size_dict = OMEinsum.get_size_dict(getixsv(code), (A, B, C, D)) # get the size of the labels
+Dict{Char, Int64} with 4 entries:
+ 'j' => 1000
+ 'i' => 1000
+ 'k' => 300
+ 'l' => 800
+
+julia> optcode = optimize_code(code, size_dict, TreeSA()) # optimize the contraction order
+SlicedEinsum{Char, DynamicNestedEinsum{Char}}(Char[], kl, kl ->
+├─ ki, li -> kl
+│ ├─ jk, ij -> ki
+│ │ ├─ jk
+│ │ └─ ij
+│ └─ li
+└─ kl
+)
+```
+
+The contraction order is optimized. Now, let's benchmark the contraction on the CPU.
+
+```julia-repl
+julia> using BenchmarkTools
+
+julia> @btime optcode($A, $B, $C, $D) # the contraction on CPU
+ 6.053 ms (308 allocations: 20.16 MiB)
+0-dimensional Array{Float64, 0}:
+1.4984046443610943e10
+```
+
+The contraction on the CPU takes about 6 ms. Now, let's upload the data to the GPU and perform the contraction on the GPU.
+```julia-repl
+julia> cuA, cuB, cuC, cuD = CuArray.((A, B, C, D)); # upload the tensors to the GPU
+
+julia> @btime CUDA.@sync optcode($cuA, $cuB, $cuC, $cuD) # the contraction on GPU
+ 243.888 μs (763 allocations: 28.56 KiB)
+0-dimensional CuArray{Float64, 0, CUDA.DeviceMemory}:
+1.4984046443610939e10
+```
+
+To learn more about using the GPU and automatic differentiation, please check out the following asciinema video:
+[![asciicast](https://asciinema.org/a/wE4CtIzWUC3R0GkVV28rVBRFb.svg)](https://asciinema.org/a/wE4CtIzWUC3R0GkVV28rVBRFb)
+
diff --git a/docs/src/extending.md b/docs/src/extending.md
deleted file mode 100644
index 24a859d..0000000
--- a/docs/src/extending.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# Extending OMEinsum
-
-Adding a new subtype of `EinRule` is bothersome - the list of rules
-that's considered needs to be fix and thus one has to change the code before
-`using` OMEinsum. A limitation due to liberal use of `generated` functions.
-If a useful rule is found, we might add it to the package itself though so feel free to reach out.
-
-Extending `einsum` for certain array-types on the other hands is easy,
-since we use the usual dispatch mechanism.
-Consider e.g. adding a special operator for index-reductions of a `Diagonal`-operator.
-
-First, we need to add a method for the `asarray`-function that ensures that we return 0-dimensional arrays for operations.
-
-```@example 1
-using OMEinsum, LinearAlgebra
-
-OMEinsum.asarray(a::Number, ::Diagonal) = fill(a,())
-```
-
-Now reducing over indices already works but it uses the `sum` function
-which does not specialize on `Diagonal`:
-```@example 1
-ein"ij -> "(Diagonal([1,2,3]))
-```
-
-we can do better by overloading the unary rule `einsum(::Sum, ixs, iy, ::Tuple{<:Diagonal}, <:Any)`:
-```@example 1
-function OMEinsum.einsum(::OMEinsum.Sum, ixs, iy, xs::Tuple{<:Diagonal}, size_dict::Dict)
- length(iy) == 1 && return diag(xs[1])
- return sum(diag(xs[1]))
-end
-```
-
-where we use that the indices `iy` and `ixs` have already been checked in `match_rule`.
-We now get our more efficient implementation when we call any of the below:
-```@example 1
-ein"ij -> i"(Diagonal([1,2,3]))
-```
-
-```@example 1
-ein"ij -> j"(Diagonal([1,2,3]))
-```
-
-```@example 1
-ein"ij -> "(Diagonal([1,2,3]))
-```
-
-(To make sure the custom implementation is called, you can add a `print`-statement to the method for `Diagonal`)
diff --git a/docs/src/implementation.md b/docs/src/implementation.md
deleted file mode 100644
index b6015cd..0000000
--- a/docs/src/implementation.md
+++ /dev/null
@@ -1,107 +0,0 @@
-# Implementations
-
-## Identity
-To test whether a specification `ixs,iy` is the identity, it is checked whether
-`ixs` is made up of _one_ tuple of index-labels that is equal to `iy` _and_
-that all index-labels in `iy` are unique - the latter to distinguish identity
-from e.g. projection to the diagonal like `ein"ii -> ii"`.
-
-The identity operation simply returns the first (and only) tensor argument to `einsum`.
-
-## Permutations
-
-A specification `ixs,iy` is an index-permutation if `ixs` is a tuple containing
-one tuple of index-labels that are all unique and are a permutation of the labels
-in `iy`.
-
-Index-permutation is implemented with `permutedims` and a permutation that's calculated
-at runtime.
-
-## Tr
-
-A specification `ixs, iy` is a trace if `iy` is empty and `ixs` contains one
-2-tuple containing the same index-label twice.
-
-A trace dispatches to the `LinearAlgebra.tr` although the result is wrapped in
-a 0-dimensional array for type stability since all `einsum` return `AbstractArray`s.
-
-## Sum
-
-A specification `ixs,iy` is a sum or a reduction over indices if all indices in `iy`
-are unique and contained in the only tuple in `ixs` that additionally contains
-unique labels (that are reduced over).
-
-Index-reductions are implemented using `Base.sum` and `Base.dropdims` - the latter
-to remove the singleton-dimensions left over after summing over a dimension.
-
-## Repeat
-The inverse rule of `Sum`, e.g. `ij->lijk`.
-
-## Diag
-A unary operation that remove multi-edges from a tensor, e.g. `ijkj->ikj`.
-
-## Duplicate
-The inverse rule of `Diag`, e.g. `ikj->ijkj`.
-
-## SimpleBinaryRule
-The contraction between two tensors with the following restriction
-* a tensor can not be simplified by unary rules, e.g. `iij,jk,ik` is not valid, the first index can be simplified to `ij` using the unary rule `iij->ij`.
-* no multi-edge
-
-A complete list of rules are
-* ein",->"
-* ein",k->k"
-* ein"i,->i"
-* ein"j,j->"
-* ein"i,k->ik" and ein"i,k->ki",
-* ein"j,jk->k" and ein"j,kj->k"
-* ein"ji,j->i" and ein"ij,j->i"
-* ein"ji,jk->ik" and its index permutations (within a tensor)
-* ein"l,l->l"
-* ein"l,kl->kl"
-* ein"il,->il"
-* ein"jl,jl->"
-* ein"il,kl->ikl" and ein"il,kl->kil",
-* ein"jl,jkl->kl" and ein"jl,kjl->kl"
-* ein"jil,jl->il" and ein"ijl,jl->il"
-* ein"jil,jkl->ikl" and its index permutations (within a tensor, except the batch dimension)
-
-Here, the batch dimension always appears as the last dimension.
-
-## Fallback
-
-The fallback is called for any specification that does not satisfy the criteria
-outlined above.
-
-The dispatch calls `loop_einsum` which is defined in `loop_einsum.jl`.
-
-`loop_einsum` is based on the `EinArray`-struct.
-An `EinArray` is a subtype of `AbstractArray` that represents an intermediate
-step in a general einsum-expression _before_ reductions remove indices.
-Consider a specification `ixs,iy` - the `EinArray` for that specification is
-the array with an index for each (distinct) label in `ixs` and `iy`.
-As an example, in `ein"ij,ik,il -> jkl"(a,b,c)`, the distinct labels are `(i,j,k,l)`
-and the corresponding `EinArray` `einarr` would be a rank-4 tensor with an index each for
-each distinct label.
-
-If an entry of `einarr` is requested, e.g. `einarr[i₁,j₁,k₁,l₁]`, it's values is lazily
-constructed as `einarr[i₁,j₁,k₁,l₁] = a[i₁,j₁]*a[i₁,k₁]*a[i₁,l₁]` upon access - the lazy evaluation avoids constructing the whole array.
-
-To get to the final result, we reduce over the dimensions that are missing in
-the output. By first allocating an array of the correct size, we can fill it
-up with the entries of the `EinArray` which are calculated on the fly,
-avoiding the allocation of the intermediate result.
-
-Thus effectively we split an operation like `ein"ij,ik,il -> jkl"(a,b,c)` into
-two piece: `einarr = ein"ij,ik,il -> ijkl"(a,b,c)` and `ein"ijkl -> jkl"(einarr)`
-but treat the first operation as a lazy one - this way we can use `mapreduce(identity, +)`
-over the dimensions we want to remove which is implemented efficiently for both
-regular `Array`s and `CuArray`s.
-
-## Debugging
-
-Calling `allow_loops(false)` will cause an error to be printed when if the
-fallback `loop_einsum` is used. This is an `@error` which does not interrupt execution.
-
-Alternatively, a log of all methods used can be saved using `@debug` logging macro.
-This is switched off by default, but can be printed by setting `ENV["JULIA_DEBUG"] = "all"`.
diff --git a/docs/src/index.md b/docs/src/index.md
index 431f451..e20cbe8 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,38 +1,16 @@
# OMEinsum.jl
-This package mainly exports one function, `einsum`, with three interfaces.
-`einsum` implements functionality similar to the `einsum` function in `numpy`,
-although some details are different.
+This package provides
+- The einsum notation, which is similar to the einsum function in `numpy`, although some details are different.
+- Highly optimized algorithms for tensor contraction and contraction order optimization.
-`einsum` operations are specified by a tuple of tensors `xs = (x1, x2, x3...)`
-, a tuple of index-labels for the tensors in `xs`, `ixs = (ix1, ix2, ix3...)`,
-and output index-labels `iy` specified as `einsum(EinCode(ixs,iy), xs)`.
-Alternatively, operations can be specified using the `@ein`-macro or
-the `@ein_str`- string literal (see examples or help).
+The source code is available at [OMEinsum.jl](https://github.com/under-Peter/OMEinsum.jl).
-Let `l` be the set of all unique labels in the `ixs` without the ones in `iy`.
-`einsum` then calculates an output tensor `y` with indices labelled `iy` according
-to the following specification:
-```math
-\forall iy : y[iy] = \sum_l x_1[ix_1] * x_2[ix_2] * x_3[ix_3] \ldots
-```
+## Quick start
-where the sum over `l` implies the sum over all possible values of the labels in `l`.
+You can find a setup guide in the [README](https://github.com/under-Peter/OMEinsum.jl). To get started, open a Julia REPL and type the following code.
-As an example, consider the _matrix multiplication_ of two random 2×2 tensors, where we have:
-```julia
-xs = (rand(2,2), rand(2,2))
-ixs = (('i','j'),('j','k'))
-iy = ('i','k')
-```
-Now `l = ('j',)` since all unique indices are `('i','j','k')`
-but both `'i'` and `'k'` are in `iy`.
-The output `y` is then defined by
-```math
-\forall i,k : y[i,k] = \sum_j x_1[i,j] * x_2[j,k]
-```
-which is just the regular definition of matrix multiplication. Alternatively it could've been specified with a custom string-literal as `ein"ij,jk -> ik"(rand(2,2),rand(2,2))`, see [Input (flat)](@ref).
-
-The structure of an `einsum` evaluation with the string-literal is depicted
-in the flowchart below:
-![](ome-flowchart.png)
+```@repl intro
+using OMEinsum
+optein"ij,jk,kl,lm->im"(randn(100, 100), randn(100, 100), randn(100, 100), randn(100, 100))
+```
\ No newline at end of file
diff --git a/docs/src/ome-flowchart.png b/docs/src/ome-flowchart.png
deleted file mode 100644
index 0bceac0..0000000
Binary files a/docs/src/ome-flowchart.png and /dev/null differ
diff --git a/docs/src/parsing.md b/docs/src/parsing.md
deleted file mode 100644
index 7105e33..0000000
--- a/docs/src/parsing.md
+++ /dev/null
@@ -1,102 +0,0 @@
-# Input (flat)
-
-An einsum specification should be given via the `ein_str` string-literal
-or with the `@ein`-macro as e.g.
-```@example 2
-using OMEinsum
-a, b = randn(2, 2), randn(2, 2)
-
-c = ein"ij,jk -> ik"(a,b)
-@ein c[i,k] := a[i,j] * b[j,k]
-```
-where both specifications encode the same operation - a matrix multiplication.
-The `ein_str`-literal is parsed directly into an `EinCode` struct that holds
-the indices of the input `ixs = (('i','j'),('j','k'))` and output `iy = ('i','k')`
-as type parameters, making them accessible at compile time.
-
-The string-literal form gets turned into
-```@example 2
-c = EinCode((('i','j'),('j','k')),('i','k'))(a,b)
-```
-Calling an `EinCode`-object gets lowered to
-```@example 2
-c = einsum(EinCode((('i','j'),('j','k')),('i','k')), (a,b), Dict('i'=>2, 'j'=>2, 'k'=>2))
-```
-The third argument `size_dict` is a dictionary to specify the dimensions of degree of freedoms, which could also allow to provide dimensions for index-labels that only appear in the output.
-
-In the next step, a singleton-subtype of the abstract type `EinRule` is chosen which is later used for dispatch.
-Subtypes of `EinRule` specify the kind of operation and are created in such a way that they allow useful dispatch.
-They are defined in `EinRule.jl`.
-
-The possible types are:
-- `Identity` - operation is the identity on _one_ tensor, e.g. `ein"ijk -> ijk"`
-- `Permutedims` - operation is a permutation of the indices of _one_ tensor, e.g. `ein"ijk -> jki"`
-- `Tr` - operation is a trace of _one_ matrix, e.g. `ein"ii ->"`
-- `Sum` - operation is a reduction over one or more indices of _one_ tensor, e.g. `ein"ijkl -> il"`
-- `SimpleBinaryRule` - operation is a pairwise contraction that can not be reduce by unary operations, e.g. `ein"ijl,jkl-> ikl"`
-- `DefaultRule` - default if none of the above match, e.g. `ein"ij,ik,il -> jkl"`
-
-Since `ixs` and `iy` are saved as type-parameters, the operation-matching can happen at compile time.
-The operation is chosen using `match_rule(ixs,iy)` by testing all subtypes of `EinRule` in the sequence above (top to bottom) and picking the first match.
-
-This enables us to chose fast BLAS functions for a matrix multiplication which is also a legal tensor-contraction.
-
-We proceed by calling `einsum(<:EinRule, <:EinCode, xs, size_dict)` which
-dispatches on the `EinRule` and the type of `xs` - the latter enables us to dispatch to e.g. cuda-specific routines for certain operations (as done in the `cueinsum.jl` file).
-
-In the case of the matrix-multiplication above, `einsum` calls `*` which can dispatch
-to efficient routines for most `Array`-types including `CuArray`.
-
-# Input (Nested)
-
-Whether with the `ein_str` string-literal or the `@ein` macro, nested expressions are mapped to a nested struct.
-Consider the example
-```@example 2
-c = ein"(ij,jk),kl -> il"(a,b,c)
-@ein c[i,l] := (a[i,j] * b[j,k]) * c[k,l]
-```
-which is a simply a product of three matrices evaluated as
-two matrix products in sequence.
-
-This is equivalent to
-```@example 2
-c = ein"ik,kl -> il"(ein"ij,jk -> ik"(a,b),c)
-@ein ab[i,k] := a[i,j] * b[j,k]
-@ein c[i,l] := ab[i,k] * c[k,l]
-```
-and is expressed as a nested structure `NestedEinsum`
-which contains the `EinCode`s for the intermediate calculations
-as well as some logic to assign the correct input and output tensors
-to the correct `EinCode`.
-
-`NestedEinsum` has the following definition:
-```@example 2
-struct NestedEinsum
- args
- eins
-end
-```
-`args` holds the arguments to that `EinCode` which can either be a integer to label a tensor or a `NestedEinsum` itself.
-The labeling works such that the `i`th input is represented by the number `i`.
-
-Upon application to tensors, a `NestedEinsum` evaluates its arguments.
-If the argument is an integer `i`, the `i`th provided tensor is chosen,
-otherwise the `NestedEinsum` is evaluated.
-
-To make it more concrete, consider the `NestedEinsum` for the expression above, where for easier reading the type signatures were removed and the `EinCode`-structs were replaced by `ein`-string literals.
-```@example 2
-ein"(ij,jk),kl -> il"
-```
-Evaluating this expression with three arguments leads to the inner `NestedEinsum` to be evaluated first with the first and second argument and the specification `ein"ij,jk -> ik"`. Then the result of that is given
-as the first argument to `ein"ik,kl -> il"` with the third argument as the second input.
-
-To improve understanding, you might replace the integers with `getindex` operations in your head
-```julia
-ein"(ij,jk),kl -> il"(xs...)
-⇒ NestedEinsum{...}((NestedEinsum{...}((xs[1], xs[2]), ein"ij,jk -> ik"), xs[3]), ein"ik,kl -> il")
-```
-and finally turn it into
-```julia
-ein"(ij,jk),kl -> il"(xs...)
-⇒ ein"ik,kl -> il"(ein"ij,jk -> ik"(xs[1], xs[2]), xs[3])
-```
\ No newline at end of file
diff --git a/examples/random_tn.jl b/examples/random_tn.jl
deleted file mode 100644
index 48b5b3f..0000000
--- a/examples/random_tn.jl
+++ /dev/null
@@ -1,58 +0,0 @@
-using OMEinsum
-using StatsBase, Random
-
-function test()
- for i=1:50
- ranka = rand(1:8)
- rankb = rand(1:8)
- ta = [1:ranka...]
- rankab = rand(1:min(ranka, rankb))
- tb = sample(ta, rankab; replace=false)
- tout = setdiff(ta, tb)
- for k=1:rankb-rankab
- push!(tb, ranka+k)
- push!(tout, ranka+k)
- end
- shuffle!(tb)
- shuffle!(tout)
- A = randn(fill(2, ranka)...)
- B = randn(fill(2, rankb)...)
- OMEinsum.batched_contract(Val((ta...,)), A, Val((tb...,)), B, Val((tout...,)))
- end
-end
-
-@time test()
-
-function batched_contract2(iAs, A::AbstractArray, iBs, B::AbstractArray, iOuts)
- pA, iAps, iAbs, iAss, pB, iBps, iBbs, iBss, pOut = OMEinsum.analyse_batched_dim(iAs, iBs, iOuts)
- sAb, sAs, sAp, sBs, sBb, sBp, sAB = OMEinsum.analyse_batched_size(iAs, iAps, iAbs, iAss, size(A), iBs, iBps, iBbs, iBss, size(B))
-
- A, B = OMEinsum.align_eltypes(A, B)
- Apr = reshape(OMEinsum.tensorpermute(A, pA), sAb, sAs, sAp)
- Bpr = reshape(OMEinsum.tensorpermute(B, pB), sBs, sBb, sBp)
- AB = OMEinsum._batched_gemm('N','N', Apr, Bpr)
- AB = OMEinsum.tensorpermute(reshape(AB, sAB...), (pOut...,))
-end
-
-function test2()
- for i=1:10
- ranka = rand(1:8)
- rankb = rand(1:8)
- ta = [1:ranka...]
- rankab = rand(1:min(ranka, rankb))
- tb = sample(ta, rankab; replace=false)
- tout = setdiff(ta, tb)
- for k=1:rankb-rankab
- push!(tb, ranka+k)
- push!(tout, ranka+k)
- end
- shuffle!(tb)
- shuffle!(tout)
- A = randn(fill(2, ranka)...)
- B = randn(fill(2, rankb)...)
- batched_contract2(ta, A, tb, B, tout)
- end
-end
-
-
-@time test2()
\ No newline at end of file
diff --git a/src/OMEinsum.jl b/src/OMEinsum.jl
index ff90099..e617382 100644
--- a/src/OMEinsum.jl
+++ b/src/OMEinsum.jl
@@ -13,6 +13,7 @@ export getiyv, getixsv, uniquelabels, labeltype
export flop
export loop_einsum, loop_einsum!, allow_loops
export asarray, asscalar
+export cost_and_gradient
# re-export the functions in OMEinsumContractionOrders
export CodeOptimizer, CodeSimplifier,
diff --git a/src/slicing.jl b/src/slicing.jl
index 18beb57..7c7570f 100644
--- a/src/slicing.jl
+++ b/src/slicing.jl
@@ -1,3 +1,12 @@
+"""
+ SlicedEinsum{LT, Ein} <: AbstractEinsum
+
+A tensor network with slicing. `LT` is the label type and `Ein` is the type of the tensor network (einsum code).
+
+### Fields
+- `slicing::Vector{LT}`: A vector of labels to slice.
+- `eins::Ein`: The tensor network.
+"""
struct SlicedEinsum{LT, Ein} <: AbstractEinsum
slicing::Vector{LT}
eins::Ein