
Feature/direct access part II #249

Draft · wants to merge 10 commits into base: main
2 changes: 2 additions & 0 deletions .github/deploy-bicep/main.bicep
@@ -23,6 +23,7 @@ param sqlServerAdminUser string
param sqlServerAdminPassword string
param sqlAdminSpnName string
param sqlAdminObjectId string
param accesConName string

// Creating permanent resource group
module rgModule 'rg-permanent.bicep' = {
@@ -82,5 +83,6 @@ module resources2 'resources-integration.bicep' = {
sqlServerAdminPassword: sqlServerAdminPassword
sqlAdminSpnName: sqlAdminSpnName
sqlAdminObjectId: sqlAdminObjectId
accesConName: accesConName
}
}
16 changes: 16 additions & 0 deletions .github/deploy-bicep/resources-integration.bicep
@@ -14,6 +14,7 @@ param sqlServerAdminUser string
param sqlServerAdminPassword string
param sqlAdminSpnName string
param sqlAdminObjectId string
param accesConName string

//#############################################################################################
//# Provision Databricks Workspace
@@ -69,6 +70,21 @@ resource staccount 'Microsoft.Storage/storageAccounts@2021-09-01' = {
}
}

// #############################################
// # Access connector for Databricks
// https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/enable-workspaces
// https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/create-metastore
// https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/azure-managed-identities#config-managed-id
// #####################################################

resource accCon 'Microsoft.Databricks/accessConnectors@2022-04-01-preview' = {
name: accesConName
location: location
identity: {
type: 'SystemAssigned'
}
}

//#############################################################################################
//# Provision Eventhub namespace and eventhubs
//#############################################################################################
2 changes: 2 additions & 0 deletions .github/deploy/steps/00-Config.ps1
@@ -21,6 +21,7 @@ $resourceGroupName = $resourceName


$databricksName = $resourceName
$accesConName = $resourceName+"-dbaccessconnector"
$dataLakeName = $resourceName
$databaseServerName = $resourceName + "test"
$deliveryDatabase = "Delivery"
@@ -103,6 +104,7 @@ Write-Host "* Azure SQL database : $deliveryDatabase" -ForegroundCo
Write-Host "* Azure EventHubs Namespace : $ehNamespace" -ForegroundColor White
Write-Host "* Azure CosmosDb name : $cosmosName" -ForegroundColor White
Write-Host "* Mounting SPN Name : $mountSpnName" -ForegroundColor White
Write-Host "* DB Access Connector name : $accesConName" -ForegroundColor White
Write-Host "**********************************************************************" -ForegroundColor White


4 changes: 3 additions & 1 deletion .github/deploy/steps/25-Provision-Service-Principal.ps1
@@ -28,10 +28,12 @@ $dbSpn = Get-SpnWithSecret -spnName $dbDeploySpnName -keyVaultName $keyVaultName
# inside databricks

$mountSpn = Get-SpnWithSecret -spnName $mountSpnName -keyVaultName $keyVaultName
$secrets.addSecret("Databricks--OauthEndpoint", "https://login.microsoftonline.com/$tenantId/oauth2/token")
$secrets.addSecret("Databricks--TenantId", $tenantId)
$secrets.addSecret("Databricks--ClientId", $mountSpn.clientId)
$secrets.addSecret("Databricks--ClientSecret", $mountSpn.secretText)
$secrets.addSecret("StorageAccount--Url", $mountSpn.secretText)

# there is a chicken-and-egg problem where we want to save the new SPN secret in the
# keyvault, but the keyvault may not exist yet. This doesn't matter since the keyvault
# is never destroyed, and by the second run, it will exist.
18 changes: 18 additions & 0 deletions .github/deploy/steps/26-Direct-Access.ps1
@@ -0,0 +1,18 @@
# This could perhaps be removed; the same path construction is also coded
# in tests/cluster/values.py (see the directAccessContainer helper below).

# Add a secret for each layer in the storageaccount
foreach ($layer in $dataLakeContainers.Values) {

# Example: atcsilver
$secretName = $layer

# Example: silver@atc
$domainPart = $($layer.ToLower()) + "@" + $dataLakeName

# Example: abfss://silver@<storageaccount>.dfs.core.windows.net/
$secretValue = "abfss://$domainPart.dfs.core.windows.net/"
Write-Host " Adding Databricks secret $secretName with value $secretValue..." -ForegroundColor DarkYellow

$values.addSecret($secretName, $secretValue)
}
3 changes: 2 additions & 1 deletion .github/deploy/steps/30-deploy-bicep.ps1
@@ -34,7 +34,8 @@ $output = az deployment sub create `
sqlServerAdminUser=$sqlServerAdminUser `
sqlServerAdminPassword=$sqlServerAdminPassword `
sqlAdminSpnName=$sqlAdminSpnName `
sqlAdminObjectId=$sqlAdminObjectId
sqlAdminObjectId=$sqlAdminObjectId `
accesConName=$accesConName


Throw-WhenError -output $output
12 changes: 12 additions & 0 deletions .github/deploy/steps/45-Provision-DbSpn-Roles.ps1
@@ -7,3 +7,15 @@ $output = az role assignment create `
--resource-group $resourceGroupName

Throw-WhenError -output $output

Write-Host "  Assigning Databricks Access Connector as Storage Blob Data Contributor"

$DbAccessConnectorSpn = Graph-GetSpn -queryDisplayName $accesConName

$output = az role assignment create `
--role "Storage Blob Data Contributor" `
--assignee-principal-type ServicePrincipal `
--assignee-object-id $DbAccessConnectorSpn.id `
--resource-group $resourceGroupName

Throw-WhenError -output $output
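
The step above resolves the access connector's service principal through a Graph query on its display name and scopes the role to the whole resource group. A minimal alternative sketch, not part of this PR: read the principal id of the connector's system-assigned identity straight off the ARM resource and scope the assignment to the data lake account only (variable names are reused from 00-Config.ps1; the narrower scope is an assumption about intent, not something the PR does).

# Sketch: resolve the access connector's managed identity and scope the role to the data lake only
$connectorPrincipalId = az resource show `
  --resource-group $resourceGroupName `
  --name $accesConName `
  --resource-type "Microsoft.Databricks/accessConnectors" `
  --query identity.principalId `
  --output tsv

$dataLakeId = az storage account show `
  --resource-group $resourceGroupName `
  --name $dataLakeName `
  --query id `
  --output tsv

$output = az role assignment create `
  --role "Storage Blob Data Contributor" `
  --assignee-principal-type ServicePrincipal `
  --assignee-object-id $connectorPrincipalId `
  --scope $dataLakeId

Throw-WhenError -output $output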
31 changes: 31 additions & 0 deletions .github/deploy/steps/51-create-metastore.ps1
@@ -0,0 +1,31 @@
# When creating a new workspace, there is no attached metastore.
# This script creates the metastore.
# In the future it could also be used for other purposes.

# It is unclear why this is necessary now; explicitly creating a metastore
# has not been a requirement before.

$dbworkspaceid = az resource show `
--resource-group $resourceGroupName `
--name $databricksName `
--resource-type "Microsoft.Databricks/workspaces" `
--query properties.workspaceId


$unityCatalog = databricks unity-catalog metastores create `
--name my-metastore `
--region $location `
--storage-root "abfss://silver@$dataLakeName.dfs.core.windows.net/meta"

# Fragile way of extracting the metastore id from the raw CLI output.
# A more robust approach would parse the output as JSON; a hedged sketch
# follows below this file's diff.
# See: https://rajanieshkaushikk.com/2023/01/17/demystifying-azure-databricks-unity-catalog/
$unityCatalogId = (-split $unityCatalog[7])[1] -replace '"', "" -replace ",", ""

databricks unity-catalog metastores assign `
--workspace-id $dbworkspaceid.Replace('"',"") `
--metastore-id $unityCatalogId `
--default-catalog-name main
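
A hedged sketch of a less fragile variant, not part of this PR: it assumes the databricks unity-catalog CLI prints a JSON document and that the create response exposes the id as metastore_id (both assumptions, not verified against this repo), and parses the output with ConvertFrom-Json instead of splitting lines on whitespace.

# Sketch: parse the CLI output as JSON instead of indexing into output lines
$unityCatalog = databricks unity-catalog metastores create `
    --name my-metastore `
    --region $location `
    --storage-root "abfss://silver@$dataLakeName.dfs.core.windows.net/meta"

# join the output lines and read the id (assumes the field is named metastore_id)
$unityCatalogId = ($unityCatalog -join "`n" | ConvertFrom-Json).metastore_id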
15 changes: 15 additions & 0 deletions .github/deploy/steps/91-Create-Sparkconf.ps1
@@ -0,0 +1,15 @@
# Create the cluster Spark configuration for direct storage access
Write-Host "Write cluster configuration for Direct Access..." -ForegroundColor DarkYellow
$confDirectAccess = [ordered]@{}
$confDirectAccess["spark.databricks.delta.preview.enabled"] = $true
$confDirectAccess["spark.databricks.io.cache.enabled"] = $true
$confDirectAccess["spark.master"]= "local[*, 4]"
$confDirectAccess["spark.databricks.cluster.profile"]= "singleNode"

$confDirectAccess["fs.azure.account.oauth2.client.id.$dataLakeName.dfs.core.windows.net"] = "{{secrets/secrets/Databricks--ClientId}}"
$confDirectAccess["fs.azure.account.oauth2.client.endpoint.$dataLakeName.dfs.core.windows.net"] = "{{secrets/secrets/Databricks--OauthEndpoint}}"
$confDirectAccess["fs.azure.account.oauth.provider.type.$dataLakeName.dfs.core.windows.net"] = "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
$confDirectAccess["fs.azure.account.oauth2.client.secret.$dataLakeName.dfs.core.windows.net"] = "{{secrets/secrets/Databricks--ClientSecret}}"
$confDirectAccess["fs.azure.account.auth.type.$dataLakeName.dfs.core.windows.net"] = "OAuth"

Set-Content $repoRoot\.github\submit\sparkconf.json ($confDirectAccess | ConvertTo-Json)
41 changes: 22 additions & 19 deletions .github/deploy/steps/99-SetupMounts.ps1
@@ -1,28 +1,31 @@

$srcDir = "$PSScriptRoot/../../.."
# This script is unused, since mounting is no longer
# the recommended way to access storage accounts.

Push-Location -Path $srcDir
# $srcDir = "$PSScriptRoot/../../.."

pip install dbx
# Push-Location -Path $srcDir

dbx configure
#copy "$srcDir/.github/submit/sparklibs.json" "$srcDir/tests/cluster/mount/"
# pip install dbx

$mountsJson = (,@(
@{
storageAccountName=$resourceName
secretScope="secrets"
clientIdName="Databricks--ClientId"
clientSecretName="Databricks--ClientSecret"
tenantIdName="Databricks--TenantId"
containers = [array]$($dataLakeContainers | ForEach-Object{ $_.name })
}
))
# dbx configure
# #copy "$srcDir/.github/submit/sparklibs.json" "$srcDir/tests/cluster/mount/"

$mountsJson | ConvertTo-Json -Depth 4 | Set-Content "$srcDir/tests/cluster/mount/mounts.json"
# $mountsJson = (,@(
# @{
# storageAccountName=$resourceName
# secretScope="secrets"
# clientIdName="Databricks--ClientId"
# clientSecretName="Databricks--ClientSecret"
# tenantIdName="Databricks--TenantId"
# containers = [array]$($dataLakeContainers | ForEach-Object{ $_.name })
# }
# ))

dbx deploy --deployment-file "$srcDir/tests/cluster/mount/setup_job.yml.j2"
# $mountsJson | ConvertTo-Json -Depth 4 | Set-Content "$srcDir/tests/cluster/mount/mounts.json"

dbx launch --job="Setup Mounts" --trace --kill-on-sigterm
# dbx deploy --deployment-file "$srcDir/tests/cluster/mount/setup_job.yml.j2"

Pop-Location
# dbx launch --job="Setup Mounts" --trace --kill-on-sigterm

# Pop-Location
10 changes: 4 additions & 6 deletions .github/submit/submit_test_job.ps1
@@ -33,6 +33,8 @@ param (
$sparkLibs = "sparklibs91.json"




)

# get the true repository root
@@ -82,18 +84,14 @@ Pop-Location
# remote path of the log
$logOut = "$testDir/results.log"


# construct the run submission configuration
$run = @{
run_name = "Testing Run"
# single node cluster is sufficient
new_cluster= @{
spark_version=$sparkVersion
spark_conf= @{
"spark.databricks.cluster.profile"= "singleNode"
"spark.master"= "local[*, 4]"
"spark.databricks.delta.preview.enabled"= $true
"spark.databricks.io.cache.enabled"= $true
}
spark_conf = Get-Content "$PSScriptRoot/sparkconf.json" | ConvertFrom-Json
azure_attributes= @{
"availability"= "ON_DEMAND_AZURE"
"first_on_demand"= 1
10 changes: 10 additions & 0 deletions src/atc/mount/main.py
@@ -2,10 +2,20 @@
import json
from types import SimpleNamespace

import deprecation

import atc
from atc.exceptions import AtcException
from atc.functions import init_dbutils


@deprecation.deprecated(
deprecated_in="1.0.60",
removed_in="2.0.0",
current_version=atc.__version__,
details="use direct access instead. "
"See the atc-dataplatform unittests, for example.",
)
def main():
parser = argparse.ArgumentParser(description="atc-dataplatform mountpoint setup.")
parser.add_argument(
11 changes: 8 additions & 3 deletions tests/cluster/delta/test_delta_class.py
@@ -9,6 +9,7 @@
from atc.etl.extractors.schema_extractor import SchemaExtractor
from atc.etl.loaders import SimpleLoader
from atc.spark import Spark
from tests.cluster.values import directAccessContainer


class DeltaTests(unittest.TestCase):
@@ -19,14 +20,18 @@ def setUpClass(cls) -> None:
def test_01_configure(self):
tc = Configurator()
tc.register(
"MyDb", {"name": "TestDb{ID}", "path": "/mnt/atc/silver/testdb{ID}"}
"MyDb",
{
"name": "TestDb{ID}",
"path": directAccessContainer("silver") + "testdb{ID}",
},
)

tc.register(
"MyTbl",
{
"name": "TestDb{ID}.TestTbl",
"path": "/mnt/atc/silver/testdb{ID}/testtbl",
"path": directAccessContainer("silver") + "testdb{ID}/testtbl",
},
)

@@ -40,7 +45,7 @@ def test_01_configure(self):
tc.register(
"MyTbl3",
{
"path": "/mnt/atc/silver/testdb/testtbl3",
"path": directAccessContainer("silver") + "testdb/testtbl3",
},
)

11 changes: 11 additions & 0 deletions tests/cluster/values.py
@@ -10,3 +10,14 @@ def getValue(secret_name: str):

def resourceName():
return getValue("resourceName")


def storageAccount():
return resourceName()


def directAccessContainer(containername: str):
domainPart = containername + "@" + storageAccount()

    # Example: abfss://<containername>@<storageaccount>.dfs.core.windows.net/
return "abfss://" + domainPart + ".dfs.core.windows.net/"