Orchestrating Azure Databricks Excel ETL from Azure Synapse Studio

Listing 1 — Scala (Databricks): mount ADLS Gen2 via OAuth, list the Excel folder, then unmount:
// Configure OAuth (service-principal) credentials for ADLS Gen2 access.
// All sensitive values come from the "jssugkey" secret scope, never hard-coded.
val configs = Map(
  "fs.azure.account.auth.type" -> "OAuth",
  "fs.azure.account.oauth.provider.type" -> "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
  "fs.azure.account.oauth2.client.id" -> dbutils.secrets.get(scope = "jssugkey", key = "ClientId"),
  "fs.azure.account.oauth2.client.secret" -> dbutils.secrets.get(scope = "jssugkey", key = "MyADLSGen2"),
  "fs.azure.account.oauth2.client.endpoint" -> dbutils.secrets.get(scope = "jssugkey", key = "DirectoryId")
)

// Mount the "rawdata" container at /mnt/adlsgen2.
// FIX: the original listing was missing the closing parenthesis of
// dbutils.fs.mount(...) — the call ran straight into display(...).
dbutils.fs.mount(
  source = "abfss://rawdata@synaadlsgen2.dfs.core.windows.net/",
  mountPoint = "/mnt/adlsgen2",
  extraConfigs = configs
)

// Sanity checks: show current mounts and the contents of the Excel folder.
display(dbutils.fs.mounts())
display(dbutils.fs.ls("/mnt/adlsgen2/Excel"))

// Unmount when done so a stale mount does not linger between runs.
dbutils.fs.unmount("/mnt/adlsgen2")
Listing 2 — Scala: read an Excel workbook with the spark-excel data source:
import org.apache.spark.sql._

// Workbook location on the ADLS Gen2 mount.
val excelPath = "/mnt/adlsgen2/Excel/Person.xlsx"

// Read the workbook through the crealytics Excel data source,
// treating the first row as column headers.
val reader = spark.read
  .format("com.crealytics.spark.excel")
  .option("header", "true")
val df = reader.load(excelPath)

display(df)
Listing 3 — Scala: `dataAddress` examples (named range, sheet, Excel table), enumerating sheet names, and exporting every sheet to CSV:
import org.apache.spark.sql._
import com.crealytics.spark.excel._

// Single source-of-truth path for the workbook used throughout this cell.
val workbookPath = "/mnt/adlsgen2/Excel/Person.xlsx"

// 1) Default read: first sheet, first row as header.
//    FIX: removed the trailing comma after `header = true` — on a single
//    line that is a Scala syntax error.
val dfDefault = spark.read.excel(
  header = true
).load(workbookPath)
display(dfDefault)

// 2) Read starting at an anchor cell on the named sheet 'SalesTbl_Header'.
val dfNamedSheet = spark.read.excel(
  header = true,
  dataAddress = "'SalesTbl_Header'!A4"
).load(workbookPath)
display(dfNamedSheet)

// 3) Read from the sheet named "2", starting at cell A4.
val dfSheet2 = spark.read.excel(
  header = true,
  dataAddress = "'2'!A4"
).load(workbookPath)
display(dfSheet2)

// 4) Read an Excel table by name (all rows including the header row).
val dfTable = spark.read.excel(
  header = true,
  dataAddress = "Class2[#All]"
).load(workbookPath)
display(dfTable)

// 5) Enumerate the workbook's sheet names once.
//    FIX: the original redeclared `arraySheetNames` (and `df`) several
//    times, which does not compile in a single Scala scope; duplicates
//    and redundant WorkbookReader calls removed.
val arraySheetNames = WorkbookReader(
  Map("path" -> workbookPath),
  spark.sparkContext.hadoopConfiguration
).sheetNames

// Read the first sheet by its discovered name.
val dfFirstSheet = spark.read.excel(
  header = true,
  dataAddress = arraySheetNames(0)
).load(workbookPath)
display(dfFirstSheet)

// 6) Export every sheet to its own CSV. The "NameRange" sheet is read via
//    its Excel table ("Class1"); every other sheet is read from cell A1.
//    FIX: the two loop branches differed only in dataAddress, so the
//    duplicated write logic is factored out.
arraySheetNames.foreach { sheetName =>
  val address = if (sheetName == "NameRange") "Class1[#All]" else sheetName + "!A1"
  val data = spark.read.excel(
    header = true,
    dataAddress = address
  ).load(workbookPath)
  data.repartition(1) // one output file per sheet
    .write
    .format("com.databricks.spark.csv")
    .mode("Overwrite")
    .option("header", "true")
    .save("/mnt/adlsgen2/Excel/Output/" + sheetName + ".csv")
}
Listing 4 — Scala: the parameterized `LoadSparkExcel` notebook invoked by the Synapse pipeline:
import org.apache.spark.sql._
import com.crealytics.spark.excel._

// Notebook parameter: the Excel table to export. Supplied by the Synapse
// pipeline via baseParameters; empty default for interactive runs.
dbutils.widgets.text("inputExcelTableName", "")
val inputExcelTableName = dbutils.widgets.get("inputExcelTableName")

// Read the whole named Excel table (header row included).
val df = spark.read.excel(
  header = true,
  dataAddress = inputExcelTableName + "[#All]"
).load("/mnt/adlsgen2/Excel/Person.xlsx")

// Export as a single CSV file.
// FIX: name the output after the requested table instead of the hard-coded
// "NameRange.csv" — otherwise every parameter value silently overwrote the
// same file. Matches the per-sheet naming used by the export loop.
df.repartition(1)
  .write
  .format("com.databricks.spark.csv")
  .mode("Overwrite")
  .option("header", "true")
  .save("/mnt/adlsgen2/Excel/Output/" + inputExcelTableName + ".csv")
Listing 5 — JSON: the Synapse pipeline definition that runs the Databricks notebook, passing `ExcelTableName` through to the `inputExcelTableName` widget:
{ "name": "LoadDatabricksNotebook", "properties": { "activities": [ { "name": "LoadSparkExcel", "type": "DatabricksNotebook", "dependsOn": [], "policy": { "timeout": "7.00:00:00", "retry": 0, "retryIntervalInSeconds": 30, "secureOutput": false, "secureInput": false }, "userProperties": [], "typeProperties": { "notebookPath": "/Users/youkai_yjy@msn.com/LoadSparkExcel", "baseParameters": { "inputExcelTableName": { "value": "@pipeline().parameters.ExcelTableName", "type": "Expression" } } }, "linkedServiceName": { "referenceName": "AzureDatabricks", "type": "LinkedServiceReference" } } ], "parameters": { "ExcelTableName": { "type": "string", "defaultValue": "Class2" } }, "annotations": [], "lastPublishTime": "2020-08-19T15:31:20Z" }, "type": "Microsoft.Synapse/workspaces/pipelines" } |