proc sql vs data step for looking up values from a reference table that includes exceptions

前端 未结 3 1909
-上瘾入骨i
-上瘾入骨i 2021-01-25 18:39

I am trying to find out tax values for a particular good in a particular city in a particular state. Tax values are in a reference table like this:

state    city         


        
相关标签:
3条回答
  • 2021-01-25 19:22

    SQL is the ideal tool to join these tables, as it is the most flexible at joining data.
    Using DomPazz's test data;

    /*  Builds the TAXES reference table from in-stream data: one tax rate per
        state / city / good combination. Rows carrying the literal value 'all'
        in a key column are the catch-all rows that the lookup code below
        matches against any value of that column.                            */
    data taxes;
    /*  Read informats for the four columns: three character keys and a numeric rate.  */
    informat state $8.   
             city $12.     
             Good $12.    
             tax best.;
    /*  List input: blank-delimited fields, read in the order listed.  */
    input state $ city $ good $ tax;
    /*  Everything up to the terminating semicolon line is data, so no comments may go below.  */
    datalines;
    all      all      all      0.07
    all      all      chicken  0.04
    all      jackson  all      0.01
    arizona  all      meat     0.02
    arizona  phoenix  meat     0.04
    arizona  tucson   meat     0.03
    hawaii   all      all      0.08
    hawaii   all      chicken  0.11
    nevada   reno     cigar    0.11
    nevada   vegas    cigar    0.13
    ;;;
    run;
    
    /*  Builds TO_LOOK_UP from in-stream data: the state / city / good
        combinations whose tax rate has to be found. Columns carry an lu_
        prefix, so they do not share names with the TAXES columns.        */
    data to_look_up;
    /*  Read informats for the three character key columns.  */
    informat lu_state $8.   
             lu_city $12.     
             lu_Good $12.  ;
    /*  List input: blank-delimited fields, read in the order listed.  */
    input lu_state $ lu_city $ lu_good $;
    /*  Everything up to the terminating semicolon line is data, so no comments may go below.  */
    datalines;
    nevada reno cigar
    nevada reno chicken
    hawaii honalulu chicken
    texas  dallas steak
    ;;;
    run;
    

    The query below joins each row in the to_look_up table to the rows in the taxes table where: the state matches or the taxes state equals 'all'; the city matches or the taxes city equals 'all'; and the good matches or the taxes good equals 'all'.

    This can cause more than one row in the taxes table to match a row in the to_look_up table. However, we can select the best match by prioritising the matches, i.e. preferring an exact state match over a state equal to 'all', and likewise for city and good.

    The Group By clause is important here. It should be the unique combination of variables in the to_look_up table. With this we can select the best match for each row in the to_look_up table and eliminate all other matches.

    /*  Creates TAXES_APPLIED: every row of TO_LOOK_UP together with the single
        most specific TAXES row that applies to it.

        A TAXES row applies when each of its key columns (state, city, good)
        either equals the looked-up value or holds the wildcard 'all'.
        Candidates are ranked by ONE combined score, not by three independent
        maxima. Requiring the per-column maximum of state, city and good at the
        same time drops the lookup row whenever the best state match and the
        best city match sit on different TAXES rows, e.g. 'arizona jackson meat'
        against 'arizona all meat' and 'all jackson all'.                      */
    proc sql;
    create table taxes_applied  as

    select  *

    /*  Per-column match strength: 2 = exact match, 1 = matched the 'all'
        wildcard, missing = no TAXES row was joined.                       */
    ,   case    when to_look_up.lu_state    eq  taxes.state then 2
                when 'all'                  eq  taxes.state then 1
        end                                 as  match_state

    ,   case    when to_look_up.lu_city     eq  taxes.city  then 2
                when 'all'                  eq  taxes.city  then 1
        end                                 as  match_city

    ,   case    when to_look_up.lu_good     eq  taxes.good  then 2
                when 'all'                  eq  taxes.good  then 1
        end                                 as  match_good

    /*  Combined specificity. The weights make the comparison lexicographic:
        state outranks city, city outranks good. COALESCE keeps the score at 0
        (not missing) for lookup rows that joined to no TAXES row at all.    */
    ,       coalesce(calculated match_state, 0) * 100
        +   coalesce(calculated match_city,  0) * 10
        +   coalesce(calculated match_good,  0)
                                            as  match_score

    from    to_look_up

    /*  Left join so that a lookup row with no applicable TAXES row is still kept.  */
    left    join
        taxes
    on  (       to_look_up.lu_state eq  taxes.state
            or  'all'               eq  taxes.state
        )
    and (       to_look_up.lu_city  eq  taxes.city
            or  'all'               eq  taxes.city
        )
    and (       to_look_up.lu_good  eq  taxes.good
            or  'all'               eq  taxes.good
        )

    /*  One group per looked-up combination.  */
    group   by  to_look_up.lu_state
            ,   to_look_up.lu_city
            ,   to_look_up.lu_good

    /*  Keep only the best-scoring candidate of each group. PROC SQL remerges
        the group maximum back onto the detail rows for this comparison.     */
    having  calculated match_score  eq  max (calculated match_score)

    order   by  to_look_up.lu_state
            ,   to_look_up.lu_city
            ,   to_look_up.lu_good
            ,   match_state
            ,   match_city
            ,   match_good
    ;

    quit;
    

    Joins similar to this can be used to generate sub-totals in summary tables.

    0 讨论(0)
  • 2021-01-25 19:23

    This is a bit long. I use a hash object in these situations. Iteratively "if/then/else" your way through the look up tree attempting to find a value.

    I assume Honolulu chicken should be in "Hawaii all chicken" and not "all all chicken."

    I included a macro I use for creating the hash object. The code below uses your data and a set of things to look up, and creates an output table with the looked-up taxes.

    /*  Reference table TAXES, read from in-stream data: a tax rate for each
        state / city / good key. The literal 'all' in a key column marks the
        fallback rows that the lookup step further down probes when no more
        specific row exists.                                               */
    data taxes;
    /*  Read informats: three character key columns and the numeric tax rate.  */
    informat state $8.   
             city $12.     
             Good $12.    
             tax best.;
    /*  List input: fields are blank-delimited and read in this order.  */
    input state $ city $ good $ tax;
    /*  Data lines follow; comments are not allowed until after the closing semicolons.  */
    datalines;
    all      all      all      0.07
    all      all      chicken  0.04
    all      jackson  all      0.01
    arizona  all      meat     0.02
    arizona  phoenix  meat     0.04
    arizona  tucson   meat     0.03
    hawaii   all      all      0.08
    hawaii   all      chicken  0.11
    nevada   reno     cigar    0.11
    nevada   vegas    cigar    0.13
    ;;;
    run;
    
    /*  Input table TO_LOOK_UP, read from in-stream data: the state / city /
        good combinations for which a tax rate is wanted. The lu_ prefix keeps
        these names distinct from the TAXES key columns.                     */
    data to_look_up;
    /*  Read informats for the three character key columns.  */
    informat lu_state $8.   
             lu_city $12.     
             lu_Good $12.  ;
    /*  List input: fields are blank-delimited and read in this order.  */
    input lu_state $ lu_city $ lu_good $;
    /*  Data lines follow; comments are not allowed until after the closing semicolons.  */
    datalines;
    nevada reno cigar
    nevada reno chicken
    hawaii honalulu chicken
    texas  dallas steak
    ;;;
    run;
    
    %macro create_hash(name,key,data_vars,dataset);
        /*  Generates the DATA step statements that set up a hash object.

            name      - identifier of the hash object to declare
            key       - blank-separated list of key variable names
            data_vars - blank-separated list of data variable names
            dataset   - value passed to the DATASET: argument tag
                        (callers pass a quoted data set name)

            Must be invoked inside a DATA step. Each method return code is
            assigned to the DATA step variable rc.                         */
        %local pos total item;

        declare hash &name(dataset:&dataset);

        /*  Key definition: every name except the last is followed by a comma.  */
        %let total = %sysfunc(countw(&key));
        rc = &name..definekey(
            %do pos = 1 %to %eval(&total - 1);
                "%scan(&key,&pos)",
            %end;
            /*  After the loop, pos is one past the loop bound, which is the last key.  */
            "%scan(&key,&pos)"
        );

        /*  Data definition: one definedata call per listed variable.  */
        %let total = %sysfunc(countw(&data_vars));
        %do pos = 1 %to &total;
            %let item = %scan(&data_vars,&pos);
            rc = &name..definedata("&item");
        %end;

        rc = &name..definedone();
    %mend create_hash;
    
    /*  Creates LOOKUP: each row of TO_LOOK_UP plus the tax found for it.

        The TAXES table is loaded into a hash keyed on state / city / good.
        For every input row the hash is probed with all eight combinations of
        "looked-up value" versus the wildcard 'all', from most to least
        specific, and the first hit wins:

            state city good  ->  state city all  ->  state all good
         -> state all  all   ->  all   city good ->  all   city all
         -> all   all  good  ->  all   all  all

        Compared with a hand-written chain that probes only some of these,
        this also reaches rows such as 'all jackson all'. On output, state /
        city / good hold the key of the TAXES row that matched, and tax stays
        missing (rc non-zero) when nothing matched.                          */
    data lookup;
    set to_look_up;
        format tax best.
             state $8.   
             city $12.     
             Good $12. ;

        /*  Declare and load the hash once, on the first iteration only.  */
        if _N_ = 1 then do;
            %create_hash(scg,state city good, tax,"taxes");
        end;

        tax = .;
        /*  Non-zero start value so the search loops are entered; find() returns 0 on a hit.  */
        rc  = -1;

        /*  Flag = 0 probes the looked-up value, flag = 1 probes 'all'. Nesting
            state outermost and good innermost gives the order listed above.
            The WHILE conditions stop all three loops at the first hit.       */
        do _state_all = 0 to 1 while (rc ne 0);
            do _city_all = 0 to 1 while (rc ne 0);
                do _good_all = 0 to 1 while (rc ne 0);
                    if _state_all then state = "all";
                    else state = lu_state;

                    if _city_all then city = "all";
                    else city = lu_city;

                    if _good_all then good = "all";
                    else good = lu_good;

                    /*  Fills tax only when the key is present in the hash.  */
                    rc = scg.find();
                end;
            end;
        end;

        /*  Loop flags are working variables only.  */
        drop _state_all _city_all _good_all;
    run;
    
    0 讨论(0)
  • 2021-01-25 19:23

    If it's something that you only need to do once (i.e. not an ongoing process), then an easy way out could be dividing your reference dataset into multiple datasets. One dataset would hold the observations that have 'all' in all three of state, city and good. Others would hold the observations with 'all' in exactly one of state, city or good, and others those with 'all' in two of them (state/city, city/good or state/good). That makes a total of 8 datasets, I guess (including a dataset with no 'all' in any of the variables). Once you know which variables hold 'all', you can merge accordingly. For example, for the dataset where state, city and good are all 'all', you can apply the tax of 0.07 without any merge; for the dataset with state and city = 'all', you only need to merge on good. The only other option, in my opinion, would be to create three new datasets with two variables each, where var1 = 'all' in every row and var2 holds all city names / all state names / all goods names (one observation each), and then merge them to your original dataset on var1, so that the 'all' rows are expanded into multiple rows instead of containing 'all'.

    0 讨论(0)
提交回复
热议问题