Jaro-Winkler string comparison function in SAS

白昼怎懂夜的黑 提交于 2019-12-06 12:33:59

There is no built in function for jaro-winkler distance that I am aware of. @Itzy already reference the only ones that I know of. You can roll you own functions with proc fcmp though if you feel up to it. I'll even give you a head start with the code below. I just tried to follow the wikipedia article on it. It certainly isn't close to being a perfect representation of Bill Winkler's strcmp.c file by any means and likely has lots of bugs.

proc fcmp outlib=work.jaro.chars;

  subroutine jaromatch ( string1 $ , string2 $ , matchChars $);
    outargs matchChars;
    /* Returns number of matched characters between 2 strings excluding blanks*/
    /* two chars from string1 and string2 are considered matching
       if they are no farther than floor(max(|s1|, |s2|)/2)-1  */

    str1_len = length(strip(string1));
    str2_len = length(strip(string2));

    allowedDist = floor(max(str1_len, str2_len)/2) -1;

    matchChars="";

    /* walk through string 1 and match characters to string2 */
    do i= 1 to str1_len;
      x=substr(string1,i,1);
      position = findc(string2,x ,max(1,i-allowedDist));
      if position > 0 then do;
          if position - i <= allowedDist then do;
          y=substr(string2,position,1);
          /* build list of matched characters */
          matchChars=cats(matchChars,y);
        end;
      end;
    end;
    matchChars = strip(matchChars);
  endsub;


  function jarotrans (string1 $ , string2 $ );
    ntrans = 0;
    ubnd = min(length(strip(string1)), length(strip(string2)));
    do i = 1 to ubnd;
      if substr(string1,i,1) ne substr(string2,i,1) then do;
        ntrans + 1;
      end;
    end;
    return(ntrans/2);
  endsub;

  function getPrefixlen( string1 $ , string2 $, maxprelen);
     /* get the length of the matching characters at the beginning */
     n = min(maxprelen, length(string1), length(string2));
     do i = 1 to n;
       if substr(string1,i,1) ne substr(string2,i,1) 
       then return(max(1,i-1));
     end;
  endsub;

  function jarodist(string1 $, string2 $);
    /* get number of matched characters */
    call jaromatch(string1, string2, m1);
    m1_len = length(m1);
    if m1_len = 0 then return(0);
    call jaromatch(string2, string1, m2);
    m2_len = length(m2);
    if m2_len = 0 then return(0);

    /* get number of transposed characters */
    ntrans = jarotrans(m1, m2);
    put m1_len= m2_len= ntrans= ;
    j_dist =  (m1_len/length(string1) 
             + m2_len/length(string2) 
             + (m1_len-ntrans)/m1_len )  /  3;
    return(j_dist);
  endsub;

  function jarowink( string1 $, string2 $, prefixscale);
    jarodist=jarodist(string1, string2);
    prelen=getPrefixlen(string1, string2, 4);
    if prelen = 0 then return(jarodist);
    else  return(jarodist + prelen * prefixscale * (1-jarodist));
  endsub;

run;quit;

/* tell SAS where to find the functions we just wrote */
option cmplib=work.jaro;

/* Now let's try it out! */
data _null_;
string1='DIXON';
string2='DICKSONX';
x=jarodist(string1, string2);
y=jarowink(string1, string2, 0.1);
put x= y=;
run;

I modified and corrected cmjohns' code. Thanks to him/her for starting me off. Winkler published some examples in his paper Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions". Research Report Series, RRS. (See table 6) I used the examples to test my code.

proc fcmp outlib=work.jaro.chars;

  /* Returns matched characters between 2 strings. Two chars from string1 and string2
     are considered matching if they are no farther apart than 
     floor(max(|s1|, |s2|)/2)-1                                                      */
  function jaromatch(string1 $, string2 $) $ 40;
    length matchChars $ 40;

    str1_len = lengthn(string1);
    str2_len = lengthn(string2);

    allowedDist = floor(max(str1_len, str2_len) / 2) - 1;

    *** walk through string1 and match characters to string2 ***;
    matchChars="";
    do i= 1 to str1_len;
      *** get the part of string2 to search ***;
      allowed_start = max(1, i - allowedDist);      *** starting char position ***;
      allowed_str2 = substr(string2, allowed_start, i + allowedDist - allowed_start + 1);

      *** find i char from string1 in string2 within the allowedDist ***;
      position = findc(allowed_str2, substr(string1, i, 1));
      if position > 0 
      then do;
    matchChars = cats(matchChars, substr(allowed_str2, position, 1));
    *** Once a char is assigned, it can not be assigned again. So, chg char in string2. ***;
    substr(string2, allowed_start + position -1, 1) = '~';
      end;
    end;
    return(strip(matchChars));
  endsub;

  /* count the number of "half" transpositions */
  function jarotrans(string1 $, string2 $);
    ntrans = 0;
    do i = 1 to min(lengthn(strip(string1)), lengthn(strip(string2)));
      if substr(string1, i, 1) ne substr(string2, i, 1) then ntrans + 1;
    end;

    return(ntrans / 2);
  endsub;

  /* get the length of the matching characters at the beginning */
  function getPrefixlen(string1 $, string2 $, maxprelen);
    n = min(maxprelen, lengthn(string1), lengthn(string2));

    if n = 0
    then return(0);
    else do;
      do i = 1 to n;
    if substr(string1, i, 1) ne substr(string2, i, 1) 
    then return(i - 1);
      end;
      return(n);  *** all maxprelen characters match ***;
    end;
  endsub;

  /* calc the jaro distance */
  function jarodist(string1 $, string2 $);
    *** get number of matched characters in string1 ***;
    m1 = jaromatch(string1, string2);
    m1_len = lengthn(m1);
    if m1_len = 0 then return(0);

    *** get number of matched characters in string2 ***;
    m2 = jaromatch(string2, string1);
    m2_len = lengthn(m2);
    if m2_len = 0 then return(0);

    *** get number of transposed characters ***;
    ntrans = jarotrans(m1, m2);

    *** calc jaro distance ***;
    j_dist = (m1_len / lengthn(string1) +
          m2_len / lengthn(string2) +
          (m1_len - ntrans) / m1_len
         ) / 3;

    return(j_dist);
  endsub;

  /* calc the jaro-winkler distance */
  function jarowink(string1 $, string2 $, prefixscale);
    string1 = upcase(strip(string1));
    string2 = upcase(strip(string2));

    *** check for trivial case and calc JW if needed ***;
    if string1 = string2
    then return(1.0);
    else do;
      jarodist = jarodist(string1, string2);
      prelen = getPrefixlen(string1, string2, 4);
      return(jarodist + prelen * prefixscale * (1 - jarodist));
    end;
  endsub;

run;

*** tell SAS where to find the functions we just wrote ***;
option cmplib=work.jaro;

    /* test code */
data _null_;
  put 'SHACKLEFORD SHACKELFORD 0.982';
  jw = jarowink('SHACKLEFORD', 'SHACKELFORD', 0.1);
  put jw=;
  put;
  put 'DUNNINGHAM  CUNNIGHAM   0.896';
  jw = jarowink('DUNNINGHAM', 'CUNNIGHAM', 0.1);
  put jw=;
  put;
  put 'NICHLESON   NICHULSON   0.956';
  jw = jarowink('NICHLESON', 'NICHULSON', 0.1);
  put jw=;
  put;
  put 'JONES       JOHNSON     0.832';
  jw = jarowink('JONES', 'JOHNSON', 0.1);
  put jw=;
  put;
  put 'MASSEY      MASSIE      0.933';
  jw = jarowink('MASSEY', 'MASSIE', 0.1);
  put jw=;
  put;
  put 'ABROMS      ABRAMS      0.922';
  jw = jarowink('ABROMS', 'ABRAMS', 0.1);
  put jw=;
  put; 
  put 'JERALDINE   GERALDINE   0.926';
  jw = jarowink('JERALDINE', 'GERALDINE', 0.1);
  put jw=;
  put;
  put 'MARHTA      MARTHA      0.961';
  jw = jarowink('MARHTA', 'MARTHA', 0.1);
  put jw=;
  put;
  put 'MICHELLE    MICHAEL     0.921';
  jw = jarowink('MICHELLE', 'MICHAEL', 0.1);
  put jw=;
  put;
  put 'JULIES      JULIUS      0.933';
  jw = jarowink('JULIES', 'JULIUS', 0.1);
  put jw=;
  put;
  put 'TANYA       TONYA       0.880';
  jw = jarowink('TANYA', 'TONYA', 0.1);
  put jw=;
  put;
  put 'DWAYNE      DUANE       0.840';
  jw = jarowink('DWAYNE', 'DUANE', 0.1);
  put jw=;
  put;
  put 'SEAN        SUSAN       0.805';
  jw = jarowink('SEAN', 'SUSAN', 0.1);
  put jw=;
  put;
  put 'JON         JOHN        0.933';
  jw = jarowink('JON', 'JOHN', 0.1);
  put jw=;
  put;
run;

I don't think so. It can do the Levenshtein distance (the complev function) or a generalized edit distance (compged), but I haven't seen any other edit distance functions.

If you're dead set on doing this in SAS you could write a program in PROC IML.

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!