How to convert an array extracted from a json string field to a bigquery Repeated field?

后端 未结 3 741
囚心锁ツ
囚心锁ツ 2021-01-01 23:38

We have loaded JSON blobs into a STRING field in a BigQuery table. I need to create a view (using standard SQL) over the table that extracts the array field as a BigQuery repeated field.

相关标签:
3条回答
  • 2021-01-02 00:03

    As of 1 May 2020, the JSON_EXTRACT_ARRAY function has been added, and it can be used to retrieve an array from JSON.

    #standardSQL
    -- Turns the JSON order blob into typed columns: order_id and customer_id as
    -- scalars, and `items` as a repeated record (ARRAY<STRUCT>) whose `ref_ids`
    -- field is itself a repeated string.
    WITH `yourTable` AS (
      SELECT '{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}' AS json_blob
    )
    SELECT
      JSON_EXTRACT_SCALAR(json_blob, '$.order_id') AS order_id,
      JSON_EXTRACT_SCALAR(json_blob, '$.customer_id') AS customer_id,
      ARRAY(
        SELECT
          STRUCT(
            JSON_EXTRACT_SCALAR(split_items, '$.line') AS line,
            ARRAY(
              -- Each ref_element is a JSON-formatted string (quotes included);
              -- JSON_EXTRACT_SCALAR with '$' strips it down to the bare value.
              SELECT JSON_EXTRACT_SCALAR(ref_element, '$')
              FROM UNNEST(JSON_EXTRACT_ARRAY(split_items, '$.ref_ids')) AS ref_element
                WITH OFFSET AS ref_pos
              -- UNNEST does not guarantee element order; restore the JSON order.
              ORDER BY ref_pos
            ) AS ref_ids,
            JSON_EXTRACT_SCALAR(split_items, '$.sku') AS sku,
            JSON_EXTRACT_SCALAR(split_items, '$.amount') AS amount
          )
        FROM UNNEST(JSON_EXTRACT_ARRAY(json_blob, '$.items')) AS split_items
          WITH OFFSET AS item_pos
        -- Keep the items in the same order as in the source JSON array.
        ORDER BY item_pos
      ) AS items
    FROM
      `yourTable`
    

    Returns:

    To get only the type, the query would be:

    #standardSQL
    -- Produces one row per entry of the JSON `phoneNumbers` array, exposing only
    -- that entry's `type` value.
    WITH `yourTable` AS (
      SELECT '{ "firstName": "John", "lastName" : "doe", "age"      : 26, "address"  : {     "streetAddress": "naist street",     "city"         : "Nara",     "postalCode"   : "630-0192" }, "phoneNumbers": [     {       "type"  : "iPhone",       "number": "0123-4567-8888"     },     {       "type"  : "home",       "number": "0123-4567-8910"     } ]}' AS json_blob
    )
    SELECT
      JSON_EXTRACT_SCALAR(phone_json, '$.type') AS type
    FROM `yourTable`
    -- Correlated join: each source row is paired with its own phone entries.
    CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(json_blob, '$.phoneNumbers')) AS phone_json
    

    returns:

    0 讨论(0)
  • 2021-01-02 00:04

    There is no way to do this using SQL functions in BigQuery at the time of this writing unless you can impose a hard limit on the number of values in the JSON array; see the relevant issue tracker item. Your options are:

    • Process the data differently (e.g. using Cloud Dataflow or another tool) so that you can load it from newline-delimited JSON into BigQuery.
    • Use a JavaScript UDF that takes the input JSON and returns the desired type; this is fairly straightforward but generally uses more CPU (and hence may require a higher billing tier).
    • Use SQL functions with the understanding that the solution breaks down if there are too many elements.

    Here is the approach using a JavaScript UDF:

    #standardSQL
    -- JavaScript UDF: parses the whole JSON document in one step and returns it
    -- as the declared STRUCT (order header plus a repeated `items` record).
    CREATE TEMP FUNCTION JsonToItems(input STRING)
    RETURNS STRUCT<
      order_id INT64,
      customer_id STRING,
      items ARRAY<STRUCT<
        line STRING,
        ref_ids ARRAY<STRING>,
        sku STRING,
        amount INT64
      >>
    >
    LANGUAGE js AS """
      // The parsed object is handed back as-is; the RETURNS clause above
      // declares the shape it is expected to have.
      const parsed = JSON.parse(input);
      return parsed;
    """;
    
    -- Sample order document used to exercise JsonToItems; the resulting STRUCT
    -- is expanded into top-level columns with `.*`.
    WITH sample_order AS (
      SELECT '{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}' AS json_blob
    )
    SELECT
      JsonToItems(json_blob).*
    FROM sample_order;
    

    If you do want to try the SQL-based approach without JavaScript, here's somewhat of a hack until the feature request above is resolved, where the number of array elements must be no more than 10:

    #standardSQL
    -- Collects the scalar values of the JSON `ref_ids` array into an
    -- ARRAY<STRING> by probing fixed indexes 0 through 9. Elements beyond the
    -- tenth are not read. Indexes with no scalar value yield NULL and are
    -- dropped by IGNORE NULLS.
    CREATE TEMP FUNCTION JsonExtractRefIds(json STRING) AS (
      (SELECT ARRAY_AGG(v IGNORE NULLS ORDER BY pos)
       FROM UNNEST([
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[0]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[1]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[2]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[3]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[4]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[5]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[6]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[7]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[8]'),
         JSON_EXTRACT_SCALAR(json, '$.ref_ids[9]')]) AS v
         -- ARRAY_AGG without ORDER BY may emit elements in any order; sort by
         -- the probe position so the result follows the JSON array order.
         WITH OFFSET AS pos)
    );
    
    -- Converts the JSON text of a single order item into a typed STRUCT.
    -- A NULL input is passed through as a NULL STRUCT rather than a STRUCT of
    -- NULL fields.
    CREATE TEMP FUNCTION JsonToItem(json STRING)
    RETURNS STRUCT<line STRING, ref_ids ARRAY<STRING>, sku STRING, amount INT64>
    AS (
      CASE
        WHEN json IS NULL THEN NULL
        ELSE STRUCT(
          JSON_EXTRACT_SCALAR(json, '$.line'),
          JsonExtractRefIds(json),
          JSON_EXTRACT_SCALAR(json, '$.sku'),
          -- JSON_EXTRACT_SCALAR yields a STRING; amount is declared INT64.
          CAST(JSON_EXTRACT_SCALAR(json, '$.amount') AS INT64)
        )
      END
    );
    
    -- Builds the full order STRUCT (order_id, customer_id, items) from the JSON
    -- text using SQL functions only. `items` is assembled by probing fixed
    -- indexes 0 through 9 of the JSON `items` array; elements beyond the tenth
    -- are not read.
    CREATE TEMP FUNCTION JsonToItems(json STRING) AS (
      (SELECT AS STRUCT
        CAST(JSON_EXTRACT_SCALAR(json, '$.order_id') AS INT64) AS order_id,
        JSON_EXTRACT_SCALAR(json, '$.customer_id') AS customer_id,
        (SELECT ARRAY_AGG(v IGNORE NULLS ORDER BY pos)
         FROM UNNEST([
           JsonToItem(JSON_EXTRACT(json, '$.items[0]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[1]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[2]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[3]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[4]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[5]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[6]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[7]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[8]')),
           JsonToItem(JSON_EXTRACT(json, '$.items[9]'))]) AS v
           -- ARRAY_AGG without ORDER BY may emit elements in any order; sort
           -- by the probe position so items follow the JSON array order.
           WITH OFFSET AS pos) AS items
      )
    );
    
    -- Sample order document used to exercise the SQL-only JsonToItems; the
    -- resulting STRUCT is expanded into top-level columns with `.*`.
    WITH sample_order AS (
      SELECT '{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}' AS json_blob
    )
    SELECT
      JsonToItems(json_blob).*
    FROM sample_order;
    
    0 讨论(0)
  • 2021-01-02 00:15

    A slightly more brute-force version, which I think is easier to read and to modify or adjust if needed:

    #standardSQL
    -- Extracts order_id, customer_id and a repeated `items` record from the JSON
    -- blob. The arrays are taken apart with JSON_EXTRACT_ARRAY instead of
    -- splitting the JSON text on '},{' and stripping brackets/quotes with a
    -- regex: text splitting turns an empty `items` array into one all-NULL item
    -- and corrupts ref_ids values that contain commas, brackets or quotes.
    WITH `yourTable` AS (
      SELECT '{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}' AS json_blob
    )
    SELECT
       JSON_EXTRACT_SCALAR(json_blob, '$.order_id') AS order_id,
       JSON_EXTRACT_SCALAR(json_blob, '$.customer_id') AS customer_id,
       ARRAY(
        SELECT STRUCT(
            JSON_EXTRACT_SCALAR(split_items, '$.line') AS line,
            ARRAY(
              -- Each ref_id is a JSON-formatted string; '$' unwraps the value.
              SELECT JSON_EXTRACT_SCALAR(ref_id, '$')
              FROM UNNEST(JSON_EXTRACT_ARRAY(split_items, '$.ref_ids')) AS ref_id
                WITH OFFSET AS ref_pos
              -- UNNEST does not guarantee element order; restore the JSON order.
              ORDER BY ref_pos
            ) AS ref_ids,
            JSON_EXTRACT_SCALAR(split_items, '$.sku') AS sku,
            JSON_EXTRACT_SCALAR(split_items, '$.amount') AS amount
          )
        FROM UNNEST(JSON_EXTRACT_ARRAY(json_blob, '$.items')) AS split_items
          WITH OFFSET AS item_pos
        -- Keep the items in the same order as in the source JSON array.
        ORDER BY item_pos
       ) AS items
    FROM `yourTable`
    
    0 讨论(0)
提交回复
热议问题