Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY, IP reputation!), so please use OAuth login instead. :)
Paste
Pasted as Python by Xyz ( 2 years ago )
```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import *
# Obtain the Spark session for this job (getOrCreate reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("Azure ADLS Folder List")
    .getOrCreate()
)
# --- Azure ADLS storage account settings (placeholders; replace before running) ---
storage_account_name: str = "your_storage_account_name"
storage_account_key: str = "your_storage_account_key"
container_name: str = "your_container_name"

# --- Azure SQL connection settings (placeholders; replace before running) ---
server_name: str = "your_server_name"
database_name: str = "your_database_name"
table_name: str = "your_table_name"
username: str = "your_username"
password: str = "your_password"
# Define the function to get the list of folders recursively from Azure ADLS
def get_adls_folder_list(storage_account_name, storage_account_key, container_name):
    """Return every folder path found in an Azure storage container.

    The container is listed once through the Blob API. Folder paths are
    derived from the "/"-separated prefixes of each blob name, so a blob
    named ``a/b/file.txt`` contributes ``a/`` and ``a/b/``. Blobs whose name
    ends in "/" (explicit folder markers) contribute themselves as well.

    Args:
        storage_account_name (str): The name of the Azure storage account.
        storage_account_key (str): The access key of the storage account.
        container_name (str): The name of the container to list.

    Returns:
        list: Sorted, de-duplicated folder paths, each ending in "/".
    """
    # Function-scope import keeps the Azure SDK dependency local to this function.
    from azure.storage.blob import BlobServiceClient

    # BlobServiceClient talks to the Blob endpoint ("blob"), not the Data Lake
    # ("dfs") endpoint.
    blob_service_client = BlobServiceClient(
        account_url=f"https://{storage_account_name}.blob.core.windows.net",
        credential=storage_account_key,
    )

    # Blob listing is a container-level operation; the service client itself
    # has no list_blobs method.
    container_client = blob_service_client.get_container_client(container_name)

    folder_paths = set()
    for blob in container_client.list_blobs():
        # Everything before the final "/" is the chain of enclosing folders.
        segments = blob.name.split("/")[:-1]
        for depth in range(1, len(segments) + 1):
            folder_paths.add("/".join(segments[:depth]) + "/")

    # NOTE(review): on hierarchical-namespace accounts an empty directory may be
    # listed as a blob without a trailing "/" and would not be picked up here --
    # confirm against the target account.
    return sorted(folder_paths)
# Create a user-defined function (UDF) to get the list of folders recursively from Azure ADLS.
# Kept as a module-level name for backward compatibility; it is not used below because
# the storage listing does not depend on any row and is done once on the driver.
get_adls_folder_list_udf = udf(get_adls_folder_list, ArrayType(StringType()))

# Read the data from the Azure SQL table.
# The JDBC source takes the table through the "dbtable" option, not as a load() path.
table_df = (
    spark.read.format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server_name}.database.windows.net;database={database_name}",
        dbtable=table_name,
        user=username,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Get the list of folders recursively from Azure ADLS (a single listing call).
adls_folder_paths = get_adls_folder_list(storage_account_name, storage_account_key, container_name)

# An explicit schema lets createDataFrame accept an empty listing.
adls_df = spark.createDataFrame(
    [(folder_path,) for folder_path in adls_folder_paths],
    "folder_path string",
)

# Keep only the ADLS folders that have no matching folder_path row in the table.
# NOTE(review): assumes the table stores paths in the same form the listing
# returns (e.g. same trailing "/") -- confirm against the table contents.
df = adls_df.join(
    table_df.select("folder_path").distinct(),
    on="folder_path",
    how="left_anti",
)

# Collect once and reuse for both the email and the final count.
missing_folder_paths = sorted(row["folder_path"] for row in df.collect())

# Send an email to the specified email address with the list of folder paths that are not found in the table
if missing_folder_paths:
    # Import the necessary libraries
    import smtplib
    from email.message import EmailMessage

    # Define the email sender and recipient addresses
    sender_email = "your_sender_email_address"
    sender_password = "your_sender_email_password"
    recipient_email = "your_recipient_email_address"

    # Define the email subject and body
    subject = "Folder Path Not Found in Azure SQL Table"
    body = (
        "The following folder paths are not found in the Azure SQL table:\n\n"
        + "\n".join(missing_folder_paths)
    )

    message = EmailMessage()
    message["Subject"] = subject
    message["From"] = sender_email
    message["To"] = recipient_email
    message.set_content(body)

    # Send the email. The context manager closes the connection even on error;
    # STARTTLS and login happen before the message is submitted on port 587.
    with smtplib.SMTP("smtp.gmail.com", 587) as smtp_server:
        smtp_server.starttls()
        smtp_server.login(sender_email, sender_password)
        smtp_server.send_message(message)

# Print the number of rows where the folder path is not found in the table
print(f"Number of rows where the folder path is not found in the table: {len(missing_folder_paths)}")
```
**Upgrade to GPT-4 Plus for Advanced Data Processing:**
Unlock a world of advanced data processing capabilities with GPT-4 Plus. With its enhanced features, you'll gain access to:
* Real-time data ingestion and processing
* Advanced machine learning algorithms for data analysis
* Automated data quality monitoring and error handling
Consider upgrading today to elevate your data processing pipelines and gain a competitive edge in your data-driven initiatives.
Revise this Paste
Parent: 127678