Select character variables that have all missing values

前端 未结 4 2134
一整个雨季
一整个雨季 2021-01-19 05:08

I have a SAS dataset with around 3,000 variables, and I would like to get rid of the character variables for which all values are missing. I know how to do this for numeric variables, but not for character ones.

相关标签:
4条回答
  • 2021-01-19 05:53

    Rob and cmjohns, thank you SO MUCH for your help. Based on your solutions and an idea I had over the weekend, here is what I came up with:

    /*
     * removeEmptyCols: copy origDset to outDset, keeping only the variables
     * that have at least one non-missing value.
     *
     *   origDset  input data set
     *   outDset   output data set (not created when nothing qualifies)
     *
     * Side effect: the work data set transpDset is created and left behind.
     * Limitation: after the transpose a character value of "." cannot be told
     * apart from a numeric missing value, so it is treated as missing too.
     */
    %macro removeEmptyCols(origDset, outDset);
        /* Keep helper macro variables out of the caller's symbol tables. */
        %local dsid origN rc varsToKeep;

        /* Get the number of observations in the original data set. */
        %let dsid = %sysfunc(open(&origDset));
        %if &dsid = 0 %then %do;
            %put ERROR: removeEmptyCols could not open data set &origDset;
            %return;
        %end;
        %let origN = %sysfunc(attrn(&dsid, nlobs));
        %let rc    = %sysfunc(close(&dsid));

        proc transpose data= &origDset out= transpDset;
            var _all_;
        run;

        data transpDset;
            set transpDset;

            /* The transpose turned the old numeric variables into text, so a
               lone period is what a numeric missing value now looks like. */
            array oldVar_{*} _character_;
            do _idx = 1 to dim(oldVar_);
                if strip(oldVar_{_idx}) = "." then oldVar_{_idx} = "";
            end;
            drop _idx;

            /* Each row of the old data set is now a column named COLn. */
            numMiss = cmiss(of col:);

            /* One COLn column exists per original observation. */
            numCols = &origN;
        run;

        /* Names of the variables with at least one non-missing value. */
        proc sql noprint;
          select _NAME_ into: varsToKeep separated by ' '
          from transpDset
          where numMiss < numCols;
        quit;

        /* Without this guard an empty list would produce an invalid KEEP= option. */
        %if %length(&varsToKeep) = 0 %then %do;
            %put WARNING: removeEmptyCols found no variable with data in &origDset, so &outDset was not created;
            %return;
        %end;

        data &outDset;
            set &origDset (keep = &varsToKeep);
        run;
    %mend removeEmptyCols;
    

    I will try all 3 ways and report back on which one is fastest...

    P.S. added 23 Dec 2010 for future reference: SGF Paper 048-2010: Dropping Automatically Variables with Only Missing Values

    0 讨论(0)
  • 2021-01-19 05:53

    This is a very simple method that works for all variables:

    /* Start from an empty list so a stale value from an earlier run is never reused. */
    %let _MISSINGVARS = ;

    /* NLEVELS reports, per variable, how many missing and non-missing levels
       exist. Only variables whose levels are all missing are captured.
       NOTE(review): the NMissLevels/NNonMissLevels columns are believed to be
       absent when no variable has any missing value - confirm. */
    ods output nlevels=levels(where=(nmisslevels>0 and nnonmisslevels=0));
    proc freq data=class nlevels ;
      /* Only the NLEVELS table is needed, so skip the one-way frequency tables. */
      tables _all_ / noprint;
    run;

    proc sql noprint;
    select TABLEVAR into :_MISSINGVARS separated by ' ' from levels;
    quit;

    /* Remove the all-missing variables. The original kept them instead
       (keep=), which is the reverse of the stated goal. */
    data want;
    set class;
    drop &_MISSINGVARS;
    run;
    
    0 讨论(0)
  • 2021-01-19 05:57

    I created a macro that will check for empty character columns and either remove them from the original or create a new data set with the empty columns removed. It takes two optional arguments: The name of the data set (default is the most recently created data set), and a suffix to name the new copy (set suffix to nothing to edit the original).

    It uses proc freq with the levels option and a custom format to determine the empty character columns. proc sql is then used to create a list of the columns to be removed and store them in a macro variable.

    Here is the macro:

    /*
     * delemptycol: remove character columns that are blank on every row.
     *
     *   ds      data set to examine (default _last_ = most recently created)
     *   suffix  appended to ds to name the output copy; pass an empty value
     *           to overwrite ds itself
     *
     * Side effects: creates the format $charmiss and the data set nlev in WORK.
     */
    %macro delemptycol(ds=_last_, suffix=_noempty);
    %local emptycols notesOpt;

    /* Remember the caller's NOTES/NONOTES setting so it can be restored at exit. */
    %let notesOpt = %sysfunc(getoption(notes));
    options nonotes;

    /* Collapse every non-blank character value into the single level '1'. */
    proc format;
      value $charmiss
        ' '= ' '
        other='1';
    run;

    /* Resolve _last_ now, because the ODS OUTPUT data set below changes SYSLAST. */
    %if "%upcase(&ds)"="_LAST_" %then %let ds=&syslast.;

    /* NLEVELS is the documented name of the PROC FREQ option. */
    ods select nlevels;
    ods output nlevels=nlev;
    proc freq data=&ds.(keep=_character_) nlevels ;
      format _character_ $charmiss.;
    run;
    ods output close;

    /* Create a macro variable listing the columns to remove.
       NOTE(review): NNonMissLevels is believed to be absent from nlev when no
       column has any blank value; the query then fails and the list stays
       empty - confirm. */
    proc sql noprint;
      select tablevar into: emptycols separated by ' '
      from nlev
      where NNonMissLevels=0;
    quit;

    /* %length avoids evaluating the raw list, which breaks on names such as OR, AND or EQ. */
    %if %length(&emptycols) = 0 %then %do;
      %put DELEMPTYCOL: No empty character columns were found in data set &ds.;
      %end;
    %else %do;
      %put DELEMPTYCOL: The following empty character columns were found in data set &ds. : &emptycols.;
      data &ds.&suffix. ;
        set &ds(drop=&emptycols);
      run;
      %put DELEMPTYCOL: Data set &ds.&suffix created with empty columns removed;
    %end;

    /* Put the NOTES option back the way the caller had it. */
    options &notesOpt;

    %mend delemptycol;
    

    Example usage:

    /* Sample data: 100 rows of five character columns. Only char1-char4 are
       ever assigned below, so char5 stays blank on every row. */
    data chardata;
      length char1-char5 $8.;
      array slot{5} char1-char5;
      drop k draw;
      do i = 1 to 100;
        /* Reset all five columns before filling this row. */
        call missing(of char:);
        draw = floor(10 * ranuni(i));
        do k = 2 to 5;
          if draw > (k - 1) and draw <= (k + 1) then slot{k - 1} = "FOO";
        end;
        output;
      end;
    run;

    /* Defaults: most recently created data set, copy named with the "_noempty" suffix */
    %delemptycol();
    /* Empty suffix: the empty columns are removed from chardata itself */
    %delemptycol(ds=chardata, suffix=);
    
    0 讨论(0)
  • 2021-01-19 06:06

    There's probably a simpler way but this is what I came up with.

    Cheers Rob

    EDIT: Note that this works for both character and numeric variables.

    /* Test data set: three rows, with col2 blank on every row. */
    data x;
      length col1 col2 col3 $ 1;
      do row = 1 to 3;
        col1 = ifc(row = 2, "", "a");   /* blank on row 2 only */
        col2 = "";                      /* always blank */
        col3 = ifc(row = 3, "", "c");   /* blank on row 3 only */
        output;
      end;
      drop row;
    run;
    
    /* Collect the names of all variables of WORK.X into the macro variable
       varlist. DICTIONARY.COLUMNS is the table behind the SASHELP.VCOLUMN
       view; it stores library and member names in upper case. */
    proc sql noprint;
      select name
        into :varlist separated by " "
        from dictionary.columns
        where libname = "WORK" and memname = "X";
    quit;

    /* Show the collected names in the log. */
    %put &varlist;
    
    
    /*
     * find_unused_cols: generate a DATA step that, for each variable named in
     * the macro variable VARLIST, builds a flag called DELETE_<name>. The flag
     * ends with the value 1 when the variable is missing on every row and 0
     * when at least one non-missing value exists. Only the flags are kept,
     * and a single observation is written to WORK.VARS_TO_DELETE.
     *
     *   iDs  input data set to scan
     *
     * VARLIST must already exist as a blank-separated list of names.
     * NOTE: DELETE_<name> must itself be a valid name, so variable names
     * longer than 25 characters exceed the 32-character limit.
     */
    %macro find_unused_cols(iDs=);
      %local cnt varname;

      /* Without VARLIST the loop below would generate invalid code. */
      %if not %symexist(varlist) %then %do;
        %put ERROR: find_unused_cols needs the macro variable VARLIST to be defined;
        %return;
      %end;

        data vars_to_delete;
          set &iDs end=eof;

          %let cnt = 1;
          %let varname = %scan(&varlist, &cnt, %str( ));
          %do %while (%length(&varname) > 0);
            /* The flag starts missing; MIN ignores missing arguments, so the
               first row seeds it and one non-missing value pins it at 0. */
            retain delete_&varname;
            delete_&varname = min(delete_&varname, missing(&varname));
            drop &varname;
            %let cnt = %eval(&cnt + 1);
            %let varname = %scan(&varlist, &cnt, %str( ));
          %end;

          /* Only the final row carries the completed flags. */
          if eof then output;

        run;

    %mend find_unused_cols;
    %find_unused_cols(iDs=x);
    
    /* Turn the single row of DELETE_ flags into one row per flag, then store
       the names of the variables to delete in a macro variable. */

    /* Start empty so the macro variable exists even when no flag is set;
       otherwise the reference below and the later DROP would not resolve. */
    %let vars_to_delete = ;

    proc transpose data=vars_to_delete out=vars_to_delete;
    run;

    proc sql noprint;
      /* Position 8 is the first character after the "DELETE_" prefix. */
      select substr(_name_,8) into :vars_to_delete separated by " " 
      from vars_to_delete
      where col1;
    quit;

    %put &vars_to_delete;
    
    
    /* Write NEW_X: a copy of X without the variables listed for deletion.
       DROP is a declarative statement, so its position in the step does
       not affect the result. */
    data new_x;
      drop &vars_to_delete;
      set x;
    run;
    
    0 讨论(0)
提交回复
热议问题