I would like to start a Dataproc job in response to log files arriving in a GCS bucket. I also do not want to keep a persistent cluster running, as new log files arrive only seldom.
I can use WorkflowTemplate API to manage the cluster lifecycle for me. With Dataproc Workflows I don't have to poll for either cluster to be created, or job created, or do any error handling.
Here's my Cloud Function, configured with a Cloud Storage bucket trigger on the Finalize/Create event:
index.js:
exports.startWorkflow = (event, callback) => {
const {
google
} = require('googleapis');
const region = 'global'
const zone = 'us-central1-a'
const clusterName = 'my-cluster'
const file = event.data;
console.log("Event: ", file);
if (!file.name) {
throw "Skipped processing file!";
}
const queryFileUri = "gs://" + file.bucket + "/" + file.name
console.log("Creating auth client: ");
google.auth.getApplicationDefault(
(err, authClient, projectId) => {
if (authClient.createScopedRequired && authClient.createScopedRequired()) {
authClient = authClient.createScoped([
'https://www.googleapis.com/auth/cloud-platform',
'https://www.googleapis.com/auth/userinfo.email'
]);
}
const request = {
parent: "projects/" + projectId + "/regions/" + region,
resource: {
"placement": {
"managedCluster": {
"clusterName": clusterName,
"config": {
"gceClusterConfig": {
"zoneUri": zone, // Can be omitted if using regional endpoint (like us-central1-a, not global)
}
}
}
},
"jobs": [{
"stepId": "step1",
"pigJob": {
"queryFileUri": queryFileUri,
},
"prerequisiteStepIds": [],
}]
}
};
const dataproc = google.dataproc({
version: 'v1beta2',
auth: authClient
});
dataproc.projects.regions.workflowTemplates.instantiateInline(
request, (err, result) => {
if (err) {
throw err;
}
console.log(result);
callback();
});
});
};
Make sure to set "Function to execute" to startWorkflow.
package.json:
{
"name": "dataproc-workflow",
"version": "1.0.0",
"dependencies":{ "googleapis": "30.0.0"}
}
You can run the gcloud commands below from a shell script or a Docker RUN command to create the cluster, submit the job, and delete the cluster (note the --quiet or -q option on delete):
# 1. Create the Dataproc cluster (1 master n1-standard-1, 2 workers
#    n1-standard-2) in the default subnet, running as the given
#    service account.
gcloud dataproc clusters create devops-poc-dataproc-cluster --subnet default --zone us-central1-a --master-machine-type n1-standard-1 --master-boot-disk-size 200 --num-workers 2 --worker-machine-type n1-standard-2 --worker-boot-disk-size 200 --image-version 1.3-deb9 --project gcp-project-212501 --service-account=service-id1@gcp-project-212501.iam.gserviceaccount.com
# 2. Submit the PySpark word-count job, reading from the input/ prefix
#    and writing to output/ in the Docker bucket.
#    NOTE(review): the fixed `sleep 60` presumably waits for the cluster
#    to finish initializing — fragile; consider polling cluster state instead.
sleep 60 && gcloud dataproc jobs submit pyspark /dev_app/spark_poc/wordCountSpark.py --cluster=devops-poc-dataproc-cluster -- gs://gcp-project-212501-docker_bucket/input/ gs://gcp-project-212501-docker_bucket/output/
# 3. Tear the cluster down (-q / --quiet skips the confirmation prompt).
gcloud dataproc clusters delete -q devops-poc-dataproc-cluster