Skip to content

Commit

Permalink
Merge pull request #25 from DataChefHQ/feat/aws-spark-app-on-glue
Browse files Browse the repository at this point in the history
Add Glue Component
  • Loading branch information
farbodahm authored Oct 14, 2024
2 parents 58ea78c + ae85a3e commit 925f9d3
Show file tree
Hide file tree
Showing 11 changed files with 466 additions and 87 deletions.
4 changes: 4 additions & 0 deletions devenv.nix
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
};
};

# Java is required for PySpark
languages.java.enable = true;
languages.java.jdk.package = pkgs.jdk8; # Java version running on AWS Glue

enterShell = ''
hello
pdm install
Expand Down
12 changes: 3 additions & 9 deletions examples/sparkle/Pulumi.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
name: object_storage
name: simple-spark-application
runtime:
name: python
options:
toolchain: pip
virtualenv: venv
description: A minimal Azure Native Python Pulumi program
config:
pulumi:tags:
value:
pulumi:template: azure-python
description: A minimal spark application that uses Sparkle
region: eu-west-1
8 changes: 3 additions & 5 deletions examples/sparkle/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
from damavand.cloud.provider import AwsProvider
from damavand.factories import SparkControllerFactory

Expand All @@ -10,7 +9,7 @@ def main() -> None:
spark_factory = SparkControllerFactory(
provider=AwsProvider(
app_name="my-app",
region="us-west-2",
region="eu-west-1",
),
tags={"env": "dev"},
)
Expand All @@ -22,10 +21,9 @@ def main() -> None:
CustomerOrders(),
],
)
# app_name = os.getenv("APP_NAME", "products-app") # Get app name on runtime

app_name = os.getenv("APP_NAME", "default_app") # Get app name on runtime

spark_controller.run_application(app_name)
# spark_controller.run_application(app_name)
spark_controller.provision()


Expand Down
15 changes: 14 additions & 1 deletion examples/sparkle/applications/orders.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from sparkle.config import Config
from sparkle.config import Config, IcebergConfig, KafkaReaderConfig
from sparkle.config.kafka_config import KafkaConfig, Credentials
from sparkle.writer.iceberg_writer import IcebergWriter
from sparkle.application import Sparkle
from sparkle.reader.kafka_reader import KafkaReader
Expand All @@ -15,6 +16,18 @@ def __init__(self):
version="0.0.1",
database_bucket="s3://test-bucket",
checkpoints_bucket="s3://test-checkpoints",
iceberg_output=IcebergConfig(
database_name="all_products",
database_path="",
table_name="orders_v1",
),
kafka_input=KafkaReaderConfig(
KafkaConfig(
bootstrap_servers="localhost:9119",
credentials=Credentials("test", "test"),
),
kafka_topic="src_orders_v1",
),
),
readers={"orders": KafkaReader},
writers=[IcebergWriter],
Expand Down
14 changes: 11 additions & 3 deletions examples/sparkle/applications/products.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from sparkle.application import Sparkle
from sparkle.config import Config
from sparkle.config import Config, IcebergConfig, TableConfig
from sparkle.writer.iceberg_writer import IcebergWriter
from sparkle.writer.kafka_writer import KafkaStreamPublisher
from sparkle.reader.table_reader import TableReader

from pyspark.sql import DataFrame
Expand All @@ -16,11 +15,20 @@ def __init__(self):
version="0.0.1",
database_bucket="s3://test-bucket",
checkpoints_bucket="s3://test-checkpoints",
iceberg_output=IcebergConfig(
database_name="all_products",
database_path="",
table_name="products_v1",
),
hive_table_input=TableConfig(
database="source_database",
table="products_v1",
bucket="",
),
),
readers={"products": TableReader},
writers=[
IcebergWriter,
KafkaStreamPublisher,
],
)

Expand Down
42 changes: 28 additions & 14 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies = [
"pulumi-azure-native>=2.51.0",
"pulumi-random>=4.16.3",
"sparkle @ git+https://github.com/DataChefHQ/[email protected]",
"damavand @ file:///${PROJECT_ROOT}/",
]
requires-python = ">=3.11.0"
readme = "README.md"
Expand Down
1 change: 0 additions & 1 deletion src/damavand/base/controllers/spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def application_with_id(self, app_id: str) -> Sparkle:
Returns:
Sparkle: The Spark application.
"""

for app in self.applications:
if app.config.app_id == app_id:
return app
Expand Down
Loading

0 comments on commit 925f9d3

Please sign in to comment.