问题
I'm having trouble wrapping my head around using a string function such as STARTS_WITH
or an operator such as LIKE
in a LEFT JOIN ON
where the parameters of either come from both tables in the join. Rather than try to explain the abstracts, I've distilled a small example ...
Let's consider a table called fuzzylog that has a key field fullname
that I want to canonicalize by joining to a names table that has the same column. The key field in fuzzylog may be a bit messy or free-form so a direct equality join is not possible. These tables might look something like this:
fuzzylog
table:
names
table:
The names table tries to accomodate the fuzziness by providing a contains
field which I want to join on using LIKE
should the exact match fail:
#standardSQL
SELECT l.id, n.fullname, n.nameid,
l.fullname AS logged_fullname
FROM `neilotemp.fuzzylog` l
LEFT JOIN `neilotemp.names` n
ON l.fullname = n.fullname
OR l.fullname LIKE CONCAT('%', n.contains, '%')
Unfortunately that last line, the one I really need, causes the error: LEFT OUTER JOIN cannot be used without a condition that is an equality of fields from both sides of the join. This is really what I'm trying to solve.
I've done quite a few searches but haven't been able to find anything that is getting it to click in my head. I must be one conceptual step off from the right path.
回答1:
Honestly, I think using contains
field is not the best idea here
Instead consider below approach with using Levenshtein [edit] distance
#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_string1;
try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}
try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}
return Levenshtein.get(the_string1, the_string2)
""";
WITH notrmalized_fuzzylog as (
select id, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.fuzzylog`
), normalized_names as (
select nameid, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.names`
)
select
id, l.fullname AS logged_fullname,
ARRAY_AGG(
STRUCT(n.nameid, n.fullname)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*
FROM notrmalized_fuzzylog l
CROSS JOIN normalized_names n
GROUP BY 1, 2
You can test, play with above using dummy data from your question as below
#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_string1;
try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}
try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}
return Levenshtein.get(the_string1, the_string2)
""";
WITH `project.dataset.fuzzylog` AS (
SELECT 1 id, 'John Smith' fullname UNION ALL
SELECT 2, 'Jane Doe' UNION ALL
SELECT 3, 'Ms. Jane Doe' UNION ALL
SELECT 4, 'Mr. John Smith' UNION ALL
SELECT 5, 'Smith, John' UNION ALL
SELECT 6, 'J.Smith' UNION ALL
SELECT 7, 'J. Doe'
), `project.dataset.names` AS (
SELECT 1 nameid, 'John Smith' fullname, 'smith' match UNION ALL
SELECT 2, 'Jane Doe', 'doe'
), notrmalized_fuzzylog as (
select id, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.fuzzylog`
), normalized_names as (
select nameid, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.names`
)
select
id, l.fullname AS logged_fullname,
ARRAY_AGG(
STRUCT(n.nameid, n.fullname)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*
FROM notrmalized_fuzzylog l
CROSS JOIN normalized_names n
GROUP BY 1, 2
-- ORDER BY 1
with result:
Row id logged_fullname nameid fullname
1 1 John Smith 1 John Smith
2 2 Jane Doe 2 Jane Doe
3 3 Ms. Jane Doe 2 Jane Doe
4 4 Mr. John Smith 1 John Smith
5 5 Smith, John 1 John Smith
6 6 J.Smith 1 John Smith
7 7 J. Doe 2 Jane Doe
As you can see in this solution we totally ignore/remove use of any extra artificial columns (like contains
) and rather applying Levenshtein distance to measure similarity directly between two fullnames. And as you can see before doing this we reorder/normalizing fullnames to order their parts
If this approach will work for you - you should consider improving that reordering by first removing/replacing all punctuation like dots, commas, etc with space for better result
回答2:
You can get this to work with arrays and correlated subqueries:
with fuzzylog as (
select 1 as id, 'John Smith' as fullname union all
select 2 as id, 'Jane Doe' UNION ALL
select 6 as id, 'J. Smith'
),
names as (
select 1 as nameid, 'John Smith' as fullname, 'smith' as word
)
select l.id, l.fullname, n.fullname as name_fullname, n.nameid
from (SELECT l.*,
(SELECT array_agg(n.nameid)
from names n
where l.fullname = n.fullname OR lower(l.fullname) LIKE CONCAT('%', lower(n.word), '%')
) nameids
FROM fuzzylog l
) l LEFT JOIN
unnest(l.nameids) the_nameid left join
names n
on n.nameid = the_nameid;
来源:https://stackoverflow.com/questions/53198374/how-to-use-string-functions-in-the-on-clause-of-a-left-join-with-standard-sql-in