diff --git a/chai_lab/data/dataset/msas/colabfold.py b/chai_lab/data/dataset/msas/colabfold.py
new file mode 100644
index 0000000..ffddbbf
--- /dev/null
+++ b/chai_lab/data/dataset/msas/colabfold.py
@@ -0,0 +1,412 @@
+# Copyright (c) 2024 Chai Discovery, Inc.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for details.
+
+import logging
+import os
+import random
+import tarfile
+import tempfile
+import time
+import typing
+from pathlib import Path
+
+import pandas as pd
+import requests
+from tqdm import tqdm
+
+from chai_lab.data.parsing.fasta import read_fasta
+from chai_lab.data.parsing.msas.aligned_pqt import expected_basename, hash_sequence
+
+logger = logging.getLogger(__name__)
+
+TQDM_BAR_FORMAT = (
+    "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]"
+)
+
+
+# N.B. this code is copied from https://github.com/sokrypton/ColabFold
+# and follows the license in that repository
+@typing.no_type_check  # Original ColabFold code was not well typed
+def _run_mmseqs2(
+    x,
+    prefix,
+    use_env=True,
+    use_filter=True,
+    use_templates=False,
+    filter=None,
+    use_pairing=False,
+    pairing_strategy="greedy",
+    host_url="https://api.colabfold.com",
+    user_agent: str = "",
+) -> list[str] | tuple[list[str], list[str]]:
+    """Fetch MSAs from the ColabFold MMseqs2 API, caching under f"{prefix}_{mode}".
+
+    Returns one a3m string per entry of `x`, in input order; with `use_templates`
+    it returns (a3m strings, template dirs), a dir being None if nothing was found.
+    """
+    submission_endpoint = "ticket/pair" if use_pairing else "ticket/msa"
+
+    headers = {}
+    if user_agent != "":
+        headers["User-Agent"] = user_agent
+    else:
+        logger.warning(
+            "No user agent specified. Please set a user agent (e.g., 'toolname/version contact@email') to help us debug in case of problems. This warning will become an error in the future."
+        )
+
+    def submit(seqs, mode, N=101):
+        """POST `seqs` as a FASTA query (headers numbered from N); return the JSON reply."""
+        # One FASTA record per sequence; headers are consecutive integers from N.
+        query = "".join(f">{n}\n{seq}\n" for n, seq in enumerate(seqs, start=N))
+
+        # The counter lives outside the loop so the retry cap below is enforced.
+        error_count = 0
+        while True:
+            try:
+                # https://requests.readthedocs.io/en/latest/user/advanced/#advanced
+                # "good practice to set connect timeouts to slightly larger than a multiple of 3"
+                res = requests.post(
+                    f"{host_url}/{submission_endpoint}",
+                    data={"q": query, "mode": mode},
+                    timeout=6.02,
+                    headers=headers,
+                )
+            except requests.exceptions.Timeout:
+                logger.warning("Timeout while submitting to MSA server. Retrying...")
+                continue
+            except Exception as e:
+                error_count += 1
+                logger.warning(
+                    f"Error while fetching result from MSA server. Retrying... ({error_count}/5)"
+                )
+                logger.warning(f"Error: {e}")
+                time.sleep(5)
+                if error_count > 5:
+                    raise
+                continue
+            break
+
+        try:
+            out = res.json()
+        except ValueError:
+            logger.error(f"Server didn't reply with json: {res.text}")
+            out = {"status": "ERROR"}
+        return out
+
+    def status(ID):
+        error_count = 0  # outside the loop so the retry cap is enforced
+        while True:
+            try:
+                res = requests.get(
+                    f"{host_url}/ticket/{ID}", timeout=6.02, headers=headers
+                )
+            except requests.exceptions.Timeout:
+                logger.warning(
+                    "Timeout while fetching status from MSA server. Retrying..."
+                )
+                continue
+            except Exception as e:
+                error_count += 1
+                logger.warning(
+                    f"Error while fetching result from MSA server. Retrying... ({error_count}/5)"
+                )
+                logger.warning(f"Error: {e}")
+                time.sleep(5)
+                if error_count > 5:
+                    raise
+                continue
+            break
+        try:
+            out = res.json()
+        except ValueError:
+            logger.error(f"Server didn't reply with json: {res.text}")
+            out = {"status": "ERROR"}
+        return out
+
+    def download(ID, path):
+        error_count = 0
+        while True:
+            try:
+                res = requests.get(
+                    f"{host_url}/result/download/{ID}", timeout=6.02, headers=headers
+                )
+            except requests.exceptions.Timeout:
+                logger.warning(
+                    "Timeout while fetching result from MSA server. Retrying..."
+                )
+                continue
+            except Exception as e:
+                error_count += 1
+                logger.warning(
+                    f"Error while fetching result from MSA server. Retrying... ({error_count}/5)"
+                )
+                logger.warning(f"Error: {e}")
+                time.sleep(5)
+                if error_count > 5:
+                    raise
+                continue
+            break
+        with open(path, "wb") as out:
+            out.write(res.content)
+
+    # process input x
+    seqs = [x] if isinstance(x, str) else x
+
+    # compatibility to old option
+    if filter is not None:
+        use_filter = filter
+
+    # setup mode
+    if use_filter:
+        mode = "env" if use_env else "all"
+    else:
+        mode = "env-nofilter" if use_env else "nofilter"
+
+    if use_pairing:
+        use_templates = False
+        mode = ""
+        # greedy is default, complete was the previous behavior
+        if pairing_strategy == "greedy":
+            mode = "pairgreedy"
+        elif pairing_strategy == "complete":
+            mode = "paircomplete"
+        if use_env:
+            mode = mode + "-env"
+
+    # define path
+    path = f"{prefix}_{mode}"
+    if not os.path.isdir(path):
+        os.mkdir(path)
+
+    # call mmseqs2 api
+    tar_gz_file = f"{path}/out.tar.gz"
+    N, REDO = 101, True
+
+    # deduplicate and keep track of order
+    # dict.fromkeys keeps first-seen order, so this is an order-preserving dedup
+    # without repeated list scans (assumes the sequences are hashable strings)
+    seqs_unique = list(dict.fromkeys(seqs))
+    Ms = [N + seqs_unique.index(seq) for seq in seqs]
+    # lets do it!
+    if not os.path.isfile(tar_gz_file):
+        TIME_ESTIMATE = 150 * len(seqs_unique)
+        with tqdm(total=TIME_ESTIMATE, bar_format=TQDM_BAR_FORMAT) as pbar:
+            while REDO:
+                pbar.set_description("SUBMIT")
+
+                # Resubmit job until it goes through
+                out = submit(seqs_unique, mode, N)
+                while out["status"] in ["UNKNOWN", "RATELIMIT"]:
+                    sleep_time = 5 + random.randint(0, 5)
+                    logger.error(f"Sleeping for {sleep_time}s. Reason: {out['status']}")
+                    # resubmit
+                    time.sleep(sleep_time)
+                    out = submit(seqs_unique, mode, N)
+
+                if out["status"] == "ERROR":
+                    raise Exception(
+                        "MMseqs2 API is giving errors. Please confirm your input is a valid protein sequence. If error persists, please try again an hour later."
+                    )
+
+                if out["status"] == "MAINTENANCE":
+                    raise Exception(
+                        "MMseqs2 API is undergoing maintenance. Please try again in a few minutes."
+                    )
+
+                # wait for job to finish
+                ID, TIME = out["id"], 0
+                pbar.set_description(out["status"])
+                while out["status"] in ["UNKNOWN", "RUNNING", "PENDING"]:
+                    t = 5 + random.randint(0, 5)
+                    logger.error(f"Sleeping for {t}s. Reason: {out['status']}")
+                    time.sleep(t)
+                    out = status(ID)
+                    pbar.set_description(out["status"])
+                    if out["status"] == "RUNNING":
+                        TIME += t
+                        pbar.update(n=t)
+                    # if TIME > 900 and out["status"] != "COMPLETE":
+                    #  # something failed on the server side, need to resubmit
+                    #  N += 1
+                    #  break
+
+                if out["status"] == "COMPLETE":
+                    if TIME < TIME_ESTIMATE:
+                        pbar.update(n=(TIME_ESTIMATE - TIME))
+                    REDO = False
+
+                if out["status"] == "ERROR":
+                    REDO = False
+                    raise Exception(
+                        "MMseqs2 API is giving errors. Please confirm your input is a valid protein sequence. If error persists, please try again an hour later."
+                    )
+
+            # Download results
+            download(ID, tar_gz_file)
+
+    # prep list of a3m files
+    if use_pairing:
+        a3m_files = [f"{path}/pair.a3m"]
+    else:
+        a3m_files = [f"{path}/uniref.a3m"]
+        if use_env:
+            a3m_files.append(f"{path}/bfd.mgnify30.metaeuk30.smag30.a3m")
+
+    # extract a3m files
+    if any(not os.path.isfile(a3m_file) for a3m_file in a3m_files):
+        with tarfile.open(tar_gz_file) as tar_gz:
+            tar_gz.extractall(path)
+
+    # templates
+    if use_templates:
+        templates = {}
+        with open(f"{path}/pdb70.m8", "r") as m8_file:
+            for line in m8_file:
+                p = line.rstrip().split()
+                M, pdb, _, _ = p[0], p[1], p[2], p[10]
+                M = int(M)
+                if M not in templates:
+                    templates[M] = []
+                templates[M].append(pdb)
+
+        template_paths = {}
+        for k, TMPL in templates.items():
+            TMPL_PATH = f"{prefix}_{mode}/templates_{k}"
+            if not os.path.isdir(TMPL_PATH):
+                os.mkdir(TMPL_PATH)
+                TMPL_LINE = ",".join(TMPL[:20])
+                response = None
+                error_count = 0  # outside the loop so the retry cap is enforced
+                while True:
+                    try:
+                        response = requests.get(
+                            f"{host_url}/template/{TMPL_LINE}",
+                            stream=True,
+                            timeout=6.02,
+                            headers=headers,
+                        )
+                    except requests.exceptions.Timeout:
+                        logger.warning(
+                            "Timeout while submitting to template server. Retrying..."
+                        )
+                        continue
+                    except Exception as e:
+                        error_count += 1
+                        logger.warning(
+                            f"Error while fetching result from template server. Retrying... ({error_count}/5)"
+                        )
+                        logger.warning(f"Error: {e}")
+                        time.sleep(5)
+                        if error_count > 5:
+                            raise
+                        continue
+                    break
+                with tarfile.open(fileobj=response.raw, mode="r|gz") as tar:
+                    tar.extractall(path=TMPL_PATH)
+                os.symlink("pdb70_a3m.ffindex", f"{TMPL_PATH}/pdb70_cs219.ffindex")
+                with open(f"{TMPL_PATH}/pdb70_cs219.ffdata", "w") as f:
+                    f.write("")
+            template_paths[k] = TMPL_PATH
+
+    # gather a3m lines
+    a3m_lines = {}
+    for a3m_file in a3m_files:
+        update_M, M = True, None
+        with open(a3m_file, "r") as a3m_handle:
+            for line in a3m_handle:
+                if "\x00" in line:
+                    line = line.replace("\x00", "")
+                    update_M = True
+                if line.startswith(">") and update_M:
+                    M = int(line[1:].rstrip())
+                    update_M = False
+                    if M not in a3m_lines:
+                        a3m_lines[M] = []
+                a3m_lines[M].append(line)
+
+    # return results
+
+    a3m_lines = ["".join(a3m_lines[n]) for n in Ms]
+
+    if use_templates:
+        template_paths_ = []
+        for n in Ms:
+            if n not in template_paths:
+                template_paths_.append(None)
+            else:
+                template_paths_.append(template_paths[n])
+        template_paths = template_paths_
+
+    return (a3m_lines, template_paths) if use_templates else a3m_lines
+
+
+def generate_colabfold_msas(protein_seqs: list[str], msa_dir: Path):
+    """
+    Generate MSAs using the ColabFold (https://github.com/sokrypton/ColabFold)
+    server, writing an aligned parquet file per sequence into the empty `msa_dir`.
+
+    N.B. the MSAs in our technical report were generated using jackhmmer, not
+    ColabFold, so we would expect some difference in results.
+
+    This implementation also relies on ColabFold's chain pairing algorithm
+    rather than using Chai-1's own algorithm, which could also lead to
+    differences in results.
+    """
+    assert msa_dir.is_dir(), "MSA directory must be a dir"
+    assert not any(msa_dir.iterdir()), "MSA directory must be empty"
+
+    with tempfile.TemporaryDirectory() as tmp_dir_path:
+        tmp_dir = Path(tmp_dir_path)
+
+        mmseqs_dir = tmp_dir / "mmseqs"
+        mmseqs_dir.mkdir()
+        a3ms_dir = tmp_dir / "a3ms"
+        a3ms_dir.mkdir()
+
+        # Generate MSAs for each protein chain
+        print(f"Running MSA generation for {len(protein_seqs)} protein sequences")
+        msas = _run_mmseqs2(
+            protein_seqs,
+            mmseqs_dir,
+            # N.B. we can set this to False to disable pairing
+            use_pairing=len(protein_seqs) > 1,
+            user_agent="chai-lab/0.4.0 feedback@chaidiscovery.com",
+        )
+        assert isinstance(msas, list)
+
+        # Process the MSAs into our internal format
+        for protein_seq, msa in zip(protein_seqs, msas, strict=True):
+            # Write out an A3M file
+            a3m_path = a3ms_dir / f"{hash_sequence(protein_seq.upper())}.a3m"
+            a3m_path.write_text(msa)
+
+            # Convert the A3M file into aligned parquet files
+            msa_fasta = read_fasta(a3m_path)
+            headers, msa_seqs = zip(*msa_fasta)
+
+            # This shouldn't have much of an effect on the model, but we make
+            # a best effort to synthesize a source database anyway
+            source_databases = ["query"] + [
+                "uniref90" if h.startswith("UniRef") else "bfd_uniclust"
+                for h in headers[1:]
+            ]
+
+            # Map the MSAs to our internal format
+            aligned_df = pd.DataFrame(
+                data=dict(
+                    sequence=msa_seqs,
+                    source_database=source_databases,
+                    # ColabFold does not return taxonomies from its API, so we
+                    # can't rely on our internal chain pairing logic. As an
+                    # alternative, we could disable ColabFold pairing and rely
+                    # on a mapping from sequence ~> taxonomy, which would allow
+                    # us to use our internal pairing logic.
+                    pairing_key="",
+                    comment="",
+                ),
+            )
+            msa_path = msa_dir / expected_basename(protein_seq)
+            # Repeated chains (e.g. homo-oligomers) share one file; write it once
+            if not msa_path.exists():
+                aligned_df.to_parquet(msa_path)
diff --git a/examples/msas/README.md b/examples/msas/README.md
index 87977e6..2b396d5 100644
--- a/examples/msas/README.md
+++ b/examples/msas/README.md
@@ -2,6 +2,8 @@
 
 While Chai-1 performs very well in "single-sequence mode," it can also be given additional evolutionary information to further improve performance. As in other folding methods, this evolutionary information is provided in the form of a multiple sequence alignment (MSA). This information is given in the form of a `MSAContext` object (see `chai_lab/data/dataset/msas/msa_context.py`); we provide code for building these `MSAContext` objects through `aligned.pqt` files, though you can play with building out an `MSAContext` yourself as well. 
 
+Multiple strategies can be used for generating MSAs. In our [technical report](https://chaiassets.com/chai-1/paper/technical_report_v1.pdf), we generated MSAs using [jackhmmer](https://github.com/EddyRivasLab/hmmer). Other algorithms such as [MMseqs2](https://github.com/soedinglab/MMseqs2) can also be used. We provide an example of how to generate MSAs using [ColabFold](https://github.com/sokrypton/ColabFold) in `examples/msas/predict_with_msas.py`. Performance will vary depending on the input MSA databases and search algorithms used.
+
 ## The `.aligned.pqt` file format
 
 The easiest way to provide MSA information to Chai-1 is through the `.aligned.pqt` file format that we have defined. This file can be thought of as an augmented `a3m` file, and is essentially a dataframe saved in parquet format with the following four (required) columns:
diff --git a/examples/msas/predict_with_msas.py b/examples/msas/predict_with_msas.py
new file mode 100644
index 0000000..3f260ba
--- /dev/null
+++ b/examples/msas/predict_with_msas.py
@@ -0,0 +1,56 @@
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from chai_lab.chai1 import run_inference
+from chai_lab.data.dataset.inference_dataset import read_inputs
+from chai_lab.data.dataset.msas.colabfold import generate_colabfold_msas
+from chai_lab.data.parsing.structure.entity_type import EntityType
+
+tmp_dir = Path(tempfile.mkdtemp())
+
+# Prepare input fasta
+example_fasta = """
+>protein|name=example-of-long-protein
+AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPRGEPRAPWVEQEGPEYWDRETQKYKRQAQTDRVSLRNLRGYYNQSEAGSHTLQWMFGCDLGPDGRLLRGYDQSAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQRRAYLEGTCVEWLRRYLENGKETLQRAEHPKTHVTHHPVSDHEATLRCWALGFYPAEITLTWQWDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPEPLTLRWEP
+>protein|name=example-of-short-protein
+AIQRTPKIQVYSRHPAENGKSNFLNCYVSGFHPSDIEVDLLKNGERIEKVEHSDLSFSKDWSFYLLYYTEFTPTEKDEYACRVNHVTLSQPKIVKWDRDM
+>protein|name=example-peptide
+GAAL
+>ligand|name=example-ligand-as-smiles
+CCCCCCCCCCCCCC(=O)O
+""".strip()
+fasta_path = tmp_dir / "example.fasta"
+fasta_path.write_text(example_fasta)
+
+# Generate MSAs (only protein entities are sent to the MSA server)
+msa_dir = tmp_dir / "msas"
+msa_dir.mkdir()
+protein_seqs = [
+    record.sequence
+    for record in read_inputs(fasta_path)
+    if record.entity_type == EntityType.PROTEIN.value
+]
+generate_colabfold_msas(protein_seqs=protein_seqs, msa_dir=msa_dir)
+
+
+# Generate structure
+output_dir = tmp_dir / "outputs"
+candidates = run_inference(
+    fasta_file=fasta_path,
+    output_dir=output_dir,
+    # 'default' setup
+    num_trunk_recycles=3,
+    num_diffn_timesteps=200,
+    seed=42,
+    device=torch.device("cuda:0"),
+    use_esm_embeddings=True,
+    msa_directory=msa_dir,
+)
+cif_paths = candidates.cif_paths
+aggregate_scores = [rd.aggregate_score for rd in candidates.ranking_data]
+
+# Load pTM, ipTM, pLDDTs and clash scores for sample 2
+scores = np.load(output_dir.joinpath("scores.model_idx_2.npz"))