Skip to content

Commit

Permalink
Add sequence to exported GFF3 (GMOD#281)
Browse files Browse the repository at this point in the history
* Added sequence data into exported GFF3

* FASTA is now included in exported GFF3

* Fixed small format issues

* All sequence data lines are now the same length in exported GFF3

* Use more streams

* Fix test

---------

Co-authored-by: Garrett Stevens <[email protected]>
  • Loading branch information
kyostiebi and garrettjstevens authored Oct 25, 2023
1 parent 8413759 commit c5d7c0a
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 24 deletions.
2 changes: 1 addition & 1 deletion packages/apollo-collaboration-server/.development.env
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ MICROSOFT_CLIENT_SECRET=~Gr8Q~h6RTU7SMC-fjNxXy_~nabTD-ME_rFyLa.M
LOG_LEVELS=error,warn,log,debug

# Reference sequence chunk size, defaults to 262144 (256 KiB)
# CHUNK_SIZE=5000
CHUNK_SIZE=500

# Default new user role, possible values are admin, user, readOnly, and none
# Defaults to none
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,13 @@ export class FeaturesController {
@Public()
@Get('exportGFF3')
async exportGFF3(
@Query() request: { exportID: string },
@Query() request: { exportID: string; fastaWidth?: number },
@Response({ passthrough: true }) res: ExpressResponse,
) {
const [stream, assembly] = await this.featuresService.exportGFF3(
request.exportID,
)
const { exportID, ...rest } = request
const [stream, assembly] = await this.featuresService.exportGFF3(exportID, {
...rest,
})
const assemblyName = await this.featuresService.getAssemblyName(assembly)
res.set({
'Content-Type': 'application/text',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import idValidator from 'mongoose-id-validator'

import { AssembliesModule } from '../assemblies/assemblies.module'
import { OperationsModule } from '../operations/operations.module'
import { RefSeqChunksModule } from '../refSeqChunks/refSeqChunks.module'
import { RefSeqsModule } from '../refSeqs/refSeqs.module'
import { FeaturesController } from './features.controller'
import { FeaturesService } from './features.service'
Expand All @@ -15,6 +16,7 @@ import { FeaturesService } from './features.service'
imports: [
forwardRef(() => AssembliesModule),
forwardRef(() => OperationsModule),
RefSeqChunksModule,
RefSeqsModule,
MongooseModule.forFeatureAsync([
{
Expand Down
160 changes: 142 additions & 18 deletions packages/apollo-collaboration-server/src/features/features.service.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
import { Readable, Transform, pipeline } from 'node:stream'
import {
Readable,
Transform,
TransformCallback,
TransformOptions,
pipeline,
} from 'node:stream'

import gff, { GFF3Feature } from '@gmod/gff'
import { Injectable, Logger, NotFoundException } from '@nestjs/common'
Expand All @@ -11,6 +17,8 @@ import {
Feature,
FeatureDocument,
RefSeq,
RefSeqChunk,
RefSeqChunkDocument,
RefSeqDocument,
} from 'apollo-schemas'
import { GetFeaturesOperation } from 'apollo-shared'
Expand All @@ -19,8 +27,86 @@ import StreamConcat from 'stream-concat'

import { FeatureRangeSearchDto } from '../entity/gff3Object.dto'
import { OperationsService } from '../operations/operations.service'
import { RefSeqChunksService } from '../refSeqChunks/refSeqChunks.service'
import { FeatureCountRequest } from './dto/feature.dto'

/** Options accepted by {@link FastaTransform}. */
interface FastaTransformOptions extends TransformOptions {
  /**
   * Number of sequence characters per FASTA line. Defaults to 80 when omitted.
   * Must be (or be convertible to) a positive integer.
   */
  fastaWidth?: number
}

/**
 * Object-mode transform that turns a stream of reference-sequence chunk
 * documents into the `##FASTA` section of a GFF3 file.
 *
 * A `>name description` header is emitted whenever the chunk's `refSeq._id`
 * differs from the previous chunk's, and sequence text is re-wrapped so that
 * every line except the last one of each sequence is exactly `fastaWidth`
 * characters long, regardless of where the incoming chunks are split.
 */
class FastaTransform extends Transform {
  /** Trailing sequence characters that do not yet fill a complete line. */
  lineBuffer = ''
  /** Stringified `_id` of the reference sequence currently being written. */
  currentRefSeq?: string = undefined
  /** Validated line width (always a positive integer). */
  fastaWidth: number

  /**
   * @param opts - Stream options plus the optional `fastaWidth`.
   * @throws RangeError if `fastaWidth` is not a positive integer.
   */
  constructor(opts: FastaTransformOptions) {
    super({ ...opts, objectMode: true })
    const { fastaWidth = 80 } = opts
    // The declared type is `number`, but the value may reach us as a string
    // (e.g. when it originates from a URL query string), so coerce it before
    // it is used in strict comparisons and arithmetic.
    const width = Number(fastaWidth)
    // A zero, negative, fractional or NaN width would stop the line splitter
    // from ever advancing, so reject it up front.
    if (!Number.isInteger(width) || width < 1) {
      throw new RangeError(
        `fastaWidth must be a positive integer, got "${String(fastaWidth)}"`,
      )
    }
    this.fastaWidth = width
    this.push('##FASTA\n')
  }

  /**
   * Writes the FASTA header when a new reference sequence starts, then emits
   * every complete line available and keeps the incomplete tail in
   * `lineBuffer` for the next chunk.
   */
  _transform(
    refSeqChunkDoc: RefSeqChunkDocument,
    encoding: BufferEncoding,
    callback: TransformCallback,
  ): void {
    const refSeqDoc = refSeqChunkDoc.refSeq
    const refSeqDocId = refSeqDoc._id.toString()
    if (refSeqDocId !== this.currentRefSeq) {
      // Finish the previous sequence's short last line before the new header.
      this.flushLineBuffer()
      const refSeqDescription = refSeqDoc.description
        ? ` ${refSeqDoc.description}`
        : ''
      this.push(`>${refSeqDoc.name}${refSeqDescription}\n`)
      this.currentRefSeq = refSeqDocId
    }
    // Prepend the carried-over partial line so that wrapping continues
    // seamlessly across chunk boundaries.
    const pending = this.lineBuffer + refSeqChunkDoc.sequence
    const seqLines = splitStringIntoChunks(pending, this.fastaWidth)
    const lastLine = seqLines[seqLines.length - 1]
    if (lastLine !== undefined && lastLine.length < this.fastaWidth) {
      // Incomplete tail: hold it back until more sequence (or the end) arrives.
      seqLines.pop()
      this.lineBuffer = lastLine
    } else {
      this.lineBuffer = ''
    }
    if (seqLines.length > 0) {
      this.push(`${seqLines.join('\n')}\n`)
    }
    callback()
  }

  /** Emits the buffered partial line, if any, followed by a newline. */
  flushLineBuffer() {
    if (this.lineBuffer) {
      this.push(`${this.lineBuffer}\n`)
      this.lineBuffer = ''
    }
  }

  /** Emits whatever is left in `lineBuffer` when the input ends. */
  _flush(callback: TransformCallback): void {
    this.flushLineBuffer()
    callback()
  }
}

/**
 * Splits `input` into consecutive pieces of `chunkSize` characters; the last
 * piece may be shorter. An empty `input` yields an empty array.
 *
 * @param input - String to split.
 * @param chunkSize - Length of each piece; must be a positive integer.
 * @returns The pieces, in order.
 * @throws RangeError if `chunkSize` is not a positive integer (the loop below
 * would otherwise never advance).
 */
function splitStringIntoChunks(input: string, chunkSize: number): string[] {
  if (!Number.isInteger(chunkSize) || chunkSize < 1) {
    throw new RangeError(
      `chunkSize must be a positive integer, got "${String(chunkSize)}"`,
    )
  }
  const chunks: string[] = []
  for (let i = 0; i < input.length; i += chunkSize) {
    chunks.push(input.slice(i, i + chunkSize))
  }
  return chunks
}

function makeGFF3Feature(
featureDocument: Feature,
refSeqs: RefSeqDocument[],
Expand Down Expand Up @@ -131,6 +217,7 @@ function makeGFF3Feature(
export class FeaturesService {
constructor(
private readonly operationsService: OperationsService,
private readonly refSeqChunkService: RefSeqChunksService,
@InjectModel(Feature.name)
private readonly featureModel: Model<FeatureDocument>,
@InjectModel(Assembly.name)
Expand All @@ -139,6 +226,8 @@ export class FeaturesService {
private readonly refSeqModel: Model<RefSeqDocument>,
@InjectModel(Export.name)
private readonly exportModel: Model<ExportDocument>,
@InjectModel(RefSeqChunk.name)
private readonly refSeqChunksModel: Model<RefSeqChunkDocument>,
) {}

private readonly logger = new Logger(FeaturesService.name)
Expand Down Expand Up @@ -195,36 +284,49 @@ export class FeaturesService {
return assemblyDoc.name
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any
async exportGFF3(exportID: string): Promise<any> {
async exportGFF3(
exportID: string,
opts: { fastaWidth?: number },
): Promise<[Readable, string]> {
const exportDoc = await this.exportModel.findById(exportID)
if (!exportDoc) {
throw new NotFoundException()
}
const { fastaWidth } = opts

const { assembly } = exportDoc
const refSeqs = await this.refSeqModel.find({ assembly }).exec()
const refSeqIds = refSeqs.map((refSeq) => refSeq._id)

const headerStream = new Readable({ objectMode: true })
headerStream.push('##gff-version 3\n')
for (const refSeqDoc of refSeqs) {
headerStream.push(
`##sequence-region ${refSeqDoc.name} 1 ${refSeqDoc.length}\n`,
)
}
headerStream.push(null)
const headerStream = pipeline(
this.refSeqModel.find({ assembly }).cursor(),
new Transform({
objectMode: true,
construct(callback) {
this.push('##gff-version 3\n')
callback()
},
transform(chunk: RefSeqDocument, encoding, callback) {
this.push(`##sequence-region ${chunk.name} 1 ${chunk.length}\n`)
callback()
},
}),
(error) => {
if (error) {
this.logger.error('GFF3 export failed')
this.logger.error(error)
}
},
)

const refSeqIds = refSeqs.map((refSeq) => refSeq._id)
const query = { refSeq: { $in: refSeqIds } }

const featureStream = pipeline(
// unicorn thinks this is an Array.prototype.find, so we ignore it
// eslint-disable-next-line unicorn/no-array-callback-reference
this.featureModel.find(query).cursor(),
new Transform({
writableObjectMode: true,
readableObjectMode: true,
transform: (chunk, encoding, callback) => {
objectMode: true,
transform: (chunk: FeatureDocument, encoding, callback) => {
try {
const flattened = chunk.toObject({ flattenMaps: true })
const gff3Feature = makeGFF3Feature(flattened, refSeqs)
Expand All @@ -242,8 +344,30 @@ export class FeaturesService {
}
},
)
const combinedStream = new StreamConcat([headerStream, featureStream])
return [combinedStream, assembly]

const sequenceStream = pipeline(
this.refSeqChunksModel
// unicorn thinks this is an Array.prototype.find, so we ignore it
// eslint-disable-next-line unicorn/no-array-callback-reference
.find(query)
.sort({ refSeq: 1, n: 1 })
.populate('refSeq')
.cursor(),
new FastaTransform({ fastaWidth }),
(error) => {
if (error) {
this.logger.error('GFF3 export failed')
this.logger.error(error)
}
},
)

const combinedStream: Readable = new StreamConcat([
headerStream,
featureStream,
sequenceStream,
])
return [combinedStream, assembly.toString()]
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ describe('Download GFF', () => {
cy.readFile(`${Cypress.config('downloadsFolder')}/${gff[0]}`).then(
(x: string) => {
const lines: string[] = x.trim().split('\n')
expect(lines.length).eq(247)
expect(lines.length).eq(952)
},
)
})
Expand Down

0 comments on commit c5d7c0a

Please sign in to comment.