I would like to start a Dataproc job in response to log files arriving in a GCS bucket. I also do not want to keep a persistent cluster running, as new log files arrive only sporadically.
I can use WorkflowTemplate API to manage the cluster lifecycle for me. With Dataproc Workflows I don't have to poll for either cluster to be created, or job created, or do any error handling.
Here's my Cloud Function. Set its trigger to the Cloud Storage bucket's Finalize/Create event:
index.js:
exports.startWorkflow = (event, callback) => {
const {
google
} = require('googleapis');
const region = 'global'
const zone = 'us-central1-a'
const clusterName = 'my-cluster'
const file = event.data;
console.log("Event: ", file);
if (!file.name) {
throw "Skipped processing file!";
}
const queryFileUri = "gs://" + file.bucket + "/" + file.name
console.log("Creating auth client: ");
google.auth.getApplicationDefault(
(err, authClient, projectId) => {
if (authClient.createScopedRequired && authClient.createScopedRequired()) {
authClient = authClient.createScoped([
'https://www.googleapis.com/auth/cloud-platform',
'https://www.googleapis.com/auth/userinfo.email'
]);
}
const request = {
parent: "projects/" + projectId + "/regions/" + region,
resource: {
"placement": {
"managedCluster": {
"clusterName": clusterName,
"config": {
"gceClusterConfig": {
"zoneUri": zone, // Can be omitted if using regional endpoint (like us-central1-a, not global)
}
}
}
},
"jobs": [{
"stepId": "step1",
"pigJob": {
"queryFileUri": queryFileUri,
},
"prerequisiteStepIds": [],
}]
}
};
const dataproc = google.dataproc({
version: 'v1beta2',
auth: authClient
});
dataproc.projects.regions.workflowTemplates.instantiateInline(
request, (err, result) => {
if (err) {
throw err;
}
console.log(result);
callback();
});
});
};
Make sure to set the "Function to execute" field to startWorkflow.
package.json:
{
"name": "dataproc-workflow",
"version": "1.0.0",
"dependencies":{ "googleapis": "30.0.0"}
}