I have a node.js program calling a Postgres (Amazon RDS micro instance) function, get_jobs
within a transaction, 18 times a second using the node-postgres library.
I used this to great effect with SQL Server and I don't trust any query optimiser now
Then don't use them. You can still execute queries directly, as shown below.
but please tell me if this is the wrong approach for Postgres!
It is not a completely wrong approach, it's just a very awkward one, as you are trying to re-create something that has already been implemented by others in a much easier-to-use form. As a result, you are making many mistakes that can lead to many problems, including memory leaks.
Compare to the simplicity of the exact same example that uses pg-promise:
// Initialise pg-promise and build the database object from the connection string.
var pgp = require('pg-promise')();
var conString = "postgres://username:password@server/database";
var db = pgp(conString);

/**
 * Calls the get_jobs database function inside a transaction.
 * Returns whatever db.tx returns; poll() consumes it as a promise.
 */
function getJobs() {
    function callGetJobs(t) {
        return t.func('get_jobs');
    }
    return db.tx(callGetJobs);
}

/**
 * Requests the jobs once, then re-arms itself to run again in 55 ms.
 * The timer is set right away; it does not wait for the request to settle.
 */
function poll() {
    function onJobs(jobs) {
        // process the jobs
    }
    function onError(error) {
        // error
    }
    getJobs().then(onJobs).catch(onError);
    setTimeout(poll, 55);
}

poll(); // start polling
Gets even simpler when using ES6 syntax:
// Initialise pg-promise and build the database object from the connection string.
var pgp = require('pg-promise')();
var conString = "postgres://username:password@server/database";
var db = pgp(conString);

/**
 * Calls get_jobs inside a transaction, then re-arms itself to run again
 * in 55 ms. The timer is set right away; it does not wait for the
 * request to settle.
 */
function poll() {
    db.tx((tx) => tx.func('get_jobs'))
        .then((jobs) => {
            // process the jobs
        })
        .catch((error) => {
            // error
        });
    setTimeout(poll, 55);
}

poll(); // start polling
The only thing that I didn't quite understand in your example is the use of a transaction to execute a single SELECT. This is not what transactions are generally for, as you are not changing any data. I assume you were trying to shrink a real piece of code you had that also changes some data.
In case you don't need a transaction, your code can be further reduced to:
// Initialise pg-promise and build the database object from the connection string.
var pgp = require('pg-promise')();
var conString = "postgres://username:password@server/database";
var db = pgp(conString);

/**
 * Calls get_jobs directly on the database object (no transaction), then
 * re-arms itself to run again in 55 ms. The timer is set right away; it
 * does not wait for the request to settle.
 */
function poll() {
    db.func('get_jobs')
        .then((jobs) => {
            // process the jobs
        })
        .catch((error) => {
            // error
        });
    setTimeout(poll, 55);
}

poll(); // start polling
UPDATE
However, it is dangerous to schedule the next poll without waiting for the previous request to finish, as overlapping requests may also create memory/connection issues.
A safe approach should be:
/**
 * Calls get_jobs inside a transaction and schedules the next poll 55 ms
 * after the request has settled, whether it succeeded or failed, so that
 * requests never overlap.
 */
function poll() {
    // Single place that re-arms the timer; invoked from both outcomes below.
    const scheduleNext = () => {
        setTimeout(poll, 55);
    };

    db.tx((tx) => tx.func('get_jobs'))
        .then((jobs) => {
            // process the jobs
            scheduleNext();
        })
        .catch((error) => {
            // error
            scheduleNext();
        });
}
Use CTEs to create partial result sets instead of temp tables.
-- get_jobs: returns the jobs that are due, enriched with data from two
-- further tables, as a single query built from chained CTEs (no temp tables).
--
-- NOTE(review): PostgreSQL 12 and later may inline a side-effect-free CTE that
-- is referenced only once, unless it is declared AS MATERIALIZED. Confirm the
-- server version if you rely on each step being evaluated separately.
CREATE OR REPLACE FUNCTION get_jobs (
) RETURNS TABLE (
  ...
) AS
$BODY$
DECLARE
  _nowstamp bigint;
BEGIN
  -- take the current unix server time in ms
  _nowstamp := (select extract(epoch from now()) * 1000)::bigint;

  RETURN query (
    -- 1. get the jobs that are due
    --    (the CTE body must not end with ';' - a semicolon there would
    --    terminate the statement before the closing parenthesis)
    WITH jobs AS (
      select ...
      from really_big_table_1
      where job_time < _nowstamp
    ),
    -- 2. get other stuff attached to those jobs
    jobs_extra AS (
      select ...
      from really_big_table_2 r
      inner join jobs j on r.id = j.some_id
    )
    -- 3. return the final result with a join to a third big table
    select je.id, ...
    from jobs_extra je
    left join really_big_table_3 r on je.id = r.id
    group by je.id
  );
END
$BODY$ LANGUAGE plpgsql VOLATILE;
The planner will evaluate each block in sequence the way I wanted to achieve with temp tables.
I know this doesn't directly solve the memory leak issue (I'm pretty sure there's something wrong with Postgres' implementation of temp tables, at least the way they manifest on the RDS configuration).
However, the query works, it is planned the way I intended, the memory usage has been stable after 3 days of running the job, and my server doesn't crash.
I didn't change the node code at all.