-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Support user provided batch size in export
- Loading branch information
Showing
16 changed files
with
374 additions
and
148 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
33 changes: 8 additions & 25 deletions
33
src/main/scala/com/exasol/cloudetl/scriptclasses/ExportTable.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
114 changes: 114 additions & 0 deletions
114
src/main/scala/com/exasol/cloudetl/sink/BatchSizedSink.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
package com.exasol.cloudetl.sink | ||
|
||
import java.util.UUID | ||
|
||
import com.exasol.cloudetl.bucket.Bucket | ||
import com.exasol.cloudetl.data.ExaColumnInfo | ||
import com.exasol.cloudetl.data.Row | ||
import com.exasol.cloudetl.parquet.ParquetRowWriter | ||
import com.exasol.cloudetl.parquet.ParquetWriteOptions | ||
import com.exasol.cloudetl.util.SchemaUtil | ||
|
||
import com.typesafe.scalalogging.LazyLogging | ||
import org.apache.hadoop.fs.Path | ||
|
||
/**
 * A specific [[Sink]] implementation with records per file request.
 *
 * Given the number of records for each file and total number of
 * records, it is possible to balance the exported file sizes. Thus,
 * small files are not created for the last records.
 *
 * @param nodeId The Exasol node id this sink runs on; only used in file names
 * @param vmId The virtual machine id; only used in file names
 * @param numOfRecords The total number of records this sink is expected to write
 * @param columns The Exasol column metadata used to derive the Parquet schema
 * @param bucket The [[com.exasol.cloudetl.bucket.Bucket]] the files are exported into
 */
@SuppressWarnings(Array("org.wartremover.warts.Var"))
final class BatchSizedSink(
  nodeId: Long,
  vmId: String,
  numOfRecords: Long,
  columns: Seq[ExaColumnInfo],
  override val bucket: Bucket
) extends Sink[Row]
  with LazyLogging {

  /** Batch size used when the user does not provide `EXPORT_BATCH_SIZE`. */
  final val DEFAULT_BATCH_SIZE: Int = 100000

  // NOTE(review): `_.toInt` throws NumberFormatException on a non-numeric
  // property value; presumably validated upstream — confirm.
  private val requestedBatchSize: Int =
    bucket.properties.get("EXPORT_BATCH_SIZE").fold(DEFAULT_BATCH_SIZE)(_.toInt)

  // Fail fast on a non-positive user value; otherwise the bucket math
  // below produces nonsensical (zero or negative) bucket counts.
  require(
    requestedBatchSize > 0,
    s"The EXPORT_BATCH_SIZE property must be a positive integer, got '$requestedBatchSize'."
  )

  // At least one bucket is required: without the `max`, exporting zero
  // records would make `numOfBuckets` zero, and `numOfRecords % numOfBuckets`
  // below would throw an ArithmeticException (division by zero).
  private val numOfBuckets: Long =
    math.max(1L, math.ceil(numOfRecords / requestedBatchSize.toDouble).toLong)

  // Records per file, balanced across buckets; the first `leftOvers`
  // files each take one extra record so no tiny trailing file is created.
  private val batchSize: Long = math.floor(numOfRecords / numOfBuckets.toDouble).toLong
  private var leftOvers: Long = numOfRecords % numOfBuckets

  // The currently open writer, if any; a fresh one is opened on each roll.
  private var writer: Option[Writer[Row]] = None
  private var recordsCount: Long = 0
  private var totalRecords: Long = 0

  /** Returns the total number of records written so far. */
  def getTotalRecords(): Long = totalRecords

  /** @inheritdoc */
  override def createWriter(path: String): Writer[Row] = new Writer[Row] {
    val newPath = new Path(bucket.bucketPath, path)
    val messageType = SchemaUtil.createParquetMessageType(columns, "exasol_export_schema")
    val options = ParquetWriteOptions(bucket.properties)
    val writer = ParquetRowWriter(newPath, bucket.getConfiguration(), messageType, options)

    override def write(value: Row): Unit =
      writer.write(value)

    override def close(): Unit =
      writer.close()
  }

  /**
   * @inheritdoc
   *
   * We check if the number of records written so far is more than the
   * next batch, if so closes current writer and creates a new one with
   * a different file path.
   */
  override def write(value: Row): Unit = {
    if (shouldRoll()) {
      openNewFile()
    }
    recordsCount += 1
    writer.foreach(_.write(value))
  }

  /** @inheritdoc */
  override def close(): Unit = {
    totalRecords += recordsCount
    recordsCount = 0
    writer.foreach(_.close())
  }

  // Decides whether the current file is full and a new one should start.
  // The first `leftOvers` files accept `batchSize + 1` records so the
  // remainder is spread evenly across the early files.
  private def shouldRoll(): Boolean =
    writer.isEmpty || {
      if (leftOvers > 0 && recordsCount >= batchSize + 1) {
        leftOvers -= 1
        true
      } else {
        leftOvers == 0 && recordsCount >= batchSize
      }
    }

  // Flushes the current file (if any) and opens a writer on a fresh path.
  private def openNewFile(): Unit = {
    close()
    writer = Some(createWriter(getNewPath()))
  }

  // Generates a unique parquet file name for this node and vm.
  private def getNewPath(): String = {
    val uuidStr = UUID.randomUUID.toString.replaceAll("-", "")
    s"exa_export_${nodeId}_${vmId}_$uuidStr.parquet"
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package com.exasol.cloudetl.sink | ||
|
||
import com.exasol.cloudetl.bucket.Bucket | ||
|
||
/**
 * An abstract sink representation.
 *
 * A sink consumes a stream of values of type `T` and exports them as
 * files into a [[com.exasol.cloudetl.bucket.Bucket]].
 *
 * @tparam T The type of the values this sink writes
 */
abstract class Sink[T] {

  /**
   * The specific [[com.exasol.cloudetl.bucket.Bucket]] where the files
   * will be exported.
   */
  val bucket: Bucket

  /**
   * Creates a format (parquet, avro, etc) specific writer.
   *
   * @param path The file path this writer going to write
   * @return A [[Writer]] that writes values of type `T` to the given path
   */
  def createWriter(path: String): Writer[T]

  /**
   * Writes the provided value.
   *
   * @param value The specific value to write
   */
  def write(value: T): Unit

  /**
   * Finally close the resource used for this sink.
   */
  def close(): Unit

}
|
||
/**
 * An interface for data writers.
 *
 * @tparam T The type of the values this writer accepts
 */
trait Writer[T] {

  /**
   * Writes the provided value to the path.
   *
   * @param value The value to write
   */
  def write(value: T): Unit

  /** Closes the writer. */
  def close(): Unit

}
Oops, something went wrong.