feat(downsample): support export to multiple paths in Iceberg format (#…

…1720) Adds support for export from the downsampler job to multiple destinations in the Iceberg format. Support for export of CSV-formatted data is removed. --------- Co-authored-by: nikitag55 <[email protected]>
filodb · Feb 26, 2024 · 745eb47 · 745eb47
1 parent feb8a6e
commit 745eb47
Show file tree

Hide file tree

Showing 6 changed files with 438 additions and 293 deletions.
diff --git a/core/src/main/resources/filodb-defaults.conf b/core/src/main/resources/filodb-defaults.conf
@@ -588,22 +588,20 @@ filodb {
 
     data-export {
       enabled = false
-
-      # Spark SaveMode / options.
-      save-mode = "error"
-      format = "csv"
-      options = {
-        "header": "true"
-      }
+      parallelism = 10
+      # Catalog under Unified Catalog
+      catalog = ""
+      # Database under Catalog
+      database = ""
+      # Table format
+      format = "iceberg"
 
       # Describe the sequence of labels that compose a rule-group's key.
       # A time-series should match at most one key.
       key-labels = ["_ws_"]
       # These labels will be dropped from every exported row.
       drop-labels = ["_ws_", "drop-label1"]
 
-      bucket = "file://<path-to-file>"
-
       # Each row's labels are compared against all rule-group keys. If a match is found,
       #   the row's labels are compared *sequentially* against each of the group's rules until
       #   a rule meets both of the following criteria:
@@ -615,6 +613,19 @@ filodb {
       groups = [
         {
           key = ["ws-foo"]
+          # Iceberg table name
+          table = "ws-foo"
+          # Table path
+          table-path = "s3a://<bucket>/<directory>/<catalog>/<database>/ws-foo"
+          # Adds dynamic, label-based columns to the table.
+          # E.g., _ws_ is the label key in the time series that populates the column "workspace".
+          # Similarly, _ns_ is the label key in the time series that populates the column "namespace".
+          label-column-mapping = [
+            "_ws_", "workspace",
+            "_ns_", "namespace"
+          ]
+          # Partition the Iceberg table by any of the columns from label-column-mapping
+          partition-by-columns = ["namespace"]
           rules = [
             {
               allow-filters = [
@@ -629,26 +640,6 @@ filodb {
           ]
         }
       ]
-
-      # Specifies how to generate a path to an exported time-series.
-      # The sequence of key-value pairs describe a directory path, where the
-      #   time-series are exported to the final directory.
-      # For example, if the path-spec is:
-      #     path-spec = [
-      #       "key1", "value1",
-      #       "key2", "value2"
-      #     ]
-      # then the exported time-series will be stored at the path:
-      #     ~/key1=value1/key2=value2/<file>
-      # All {{label-name}} strings will be replaced with the time-series label's value.
-      # All <<time-spec>> strings will be replaced with the result of formatting
-      #   the export window's end time with the time-spec string.
-      path-spec = [
-        "ws",   "{{_ws_}}-suffix1",
-        "year", "<<YYYY>>-suffix2",
-        "api",  "v1",
-        "ns",   "{{_ns_}}-suffix3"
-      ]
     }
   }