/*** TIP 00392 ***/
/*** Authors: Charles Patridge and Sigurd Hermansen ***/
/*** An example of a Fuzzy Logic Application - Matching Two Files with Imperfect SSN***/
/*** SSN is Social Security Number                                                  ***/
/*** Will use the SPEDIS and SOUNDEX functions to assist in determining "likeness"  ***/
/*** in matching imperfect SSNs                                                     ***/
/*** A fictitious example                                                           ***/

/*** This First example (fuzzy001) will attempt to update the Amaster File with
     the INCOME field from Btrans File, including those records that may have a
     mistyped or like SSN ***/

/*** We will assume we only have access to SSN and Lastnames for determining "likeness"
     in this example - otherwise you may be cheating ***/

/*** And finally, this example should be complete enough to address all aspects of a real
     life application - backup of master file, list of exact matches, list of similiar
     likeness, update master file, list of transactions not used for any updates - no matches
     or likeness

     That is, a novice developer should appreciate the various aspects of a complete
     application ***/


%let spedis = 20; /*** choose a spedis value to determine what costs you wish to accept.  ***/
                  /*** the lower the cost value the more likely you may have a good match ***/
                  /*** it is highly recommended to read up on the SPEDIS function and to  ***/
                  /*** test this function with a variety of data to become familiar with  ***/
                  /*** its strengths and weaknesses.                                      ***/

%let step = 0; /*** for documentation only - see below - not needed for the application ***/

ods pdf; /*** open the ODS for creating a PDF document, if needed ***/

/*** I have named the SAS Datasets starting with the Letter "A" so it will be easier to
     step through all the SAS Datasets in your LIB Window to see the results, step-by step
     We have also provided a Listing of the Data as well for printing purposes, if needed ***/


/*** Master File ***/
/*** assume this data to be as accurate as possible and not to be changed by TRANS,
     we only wish to add INCOME from TRANS to this file, Master                ***/
Data Amaster;
 infile cards missover;
 input
    @1   SSN        $ 11.
    @13  Lastname   $ 15.
    @28  Firstname  $ 15.
    @43  Address    $ 20.
    @63  Town       $ 15.
    @78  State      $  2.
    @81  Zip        $  5.
    @87  Birthdate  mmddyy10.
 ;
format Birthdate date9.;

/*** Ruler to read the data correctly
         1         2         3         4         5         6         7         8         9
123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456 ***/
cards;
023-33-3207 Patridge       Charles        172 Monce Road      Burlington     CT 06013 05/07/1953
043-44-6401 Whitlock       Ian            One State Street    Baltimore      MD 21201 06/03/1952
053-35-4467 Dorfman        Paul           756 Hyatt drive     Jacksonville   FL 32099 03/15/1954
006-24-2809 Ellingsworth   Marty          57 Cottage Drive    San Fran       CA 94102 07/04/1955
053-19-0067 Hermansen      Sigurd         378 Pleasant Terr   Rockland       MD 21286 08/23/1955
;;;;
run;

%let step = %eval( &step + 1 ); /*** My way to document each step of this demo ***/
Title1 "&Step - AMASTER File Records";
Proc Print data=AMaster; run;

/*** Transaction File ***/
/*** we need to take INCOME from this file and add it to Master ***/
/*** for our purposes in this program, we will assume we only have access to TSSN and
     TLastName for matching purposes ***/
Data Btrans;
 infile cards missover;
 input
    @1   tSSN        $ 11.
    @13  tLastname   $ 15.
    @28  tFirstname  $ 15.
    @43  tAddress    $ 20.
    @63  tTown       $ 15.
    @78  tState      $  8.
    @87  tZip        $  5.
    @93  tBirthdate  mmddyy10.
    @104 Income        6.
 ;

format tBirthdate date9.;
/*** Ruler to read the data correctly
         1         2         3         4         5         6         7         8         9         10
1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789 ***/
cards;
023-33-3207 Patridge       Charles        172 Monce Road      Burlington     CT       06013 05/07/1953 97500
033-33-3207 Partridge      Chas           Monce Road          Unionville     CT       06085 05/07/1953 95755
043-45-6410 Whitlock       Ian            1 State Street      Baltimore      Maryland 21201 03/06/1952 105250
053-35-4467 Dforfman       Paul           756-A Hyatt Drive   Jacksonville   FL       32098 03/15/1954 118500
035-35-5601 Crawford       Peter          Two Cambridge Plaza London         UK             02/29/1952 110000
006-42-2809 Ellingsworth   Martin         57 Cottage Drive    San Francisco  CA       94102 07/04/1956 135325
;;;;
run;

/*** comments about Amaster and Btrans data set ***/
/*** record 1 Btrans has exact match to Amaster Dataset ***/
/*** record 2 Btrans has incorrect 2nd digit in SSN, Lastname sounds similiar, first name is abbr, address without street #
     town is different, and zip is slightly different ***/
/*** record 3 Btrans has incorrect 5th digit, and 8th and 9th digits transposed of SSN,
     address has 1 vs One, state with Maryland vs MD, correct zip,
     month and day of birthdate is reversed ***/
/*** record 4 Btrans has correct SSN, lastname misspelled, street number slightly different, zip off by last digit,
     same birthdate ***/
/*** record 5 Btrans is not in Amaster dataset - no chance for matching ***/
/*** record 6 Btrans digits 4 and 5 of SSN is tranposed, Firstname is nick name, Town is full name,
     same zipcode, birthyear is off by 1 year ***/
/*** record 5 of Amaster is not in Btrans ***/

%let step = %eval( &step + 1 );
Title1 "&Step - BTRANS File Records";
Proc Print data=BTRANS; run;

/*** First, let us assume we only have SSN to match on ***/

data Cmaster1;
  set Amaster;
  /*** eliminate certain characters to clean up field and compress blanks out ***/
  /*** let us deal only with numeric digits but still a character field ***/
  ssn = compress(translate( ssn, '', '-/ ,' ));
run;

%let step = %eval( &step + 1 );
Title1 "&Step - CMASTER1 File Records";
Proc Print data=CMASTER1; run;

data Dtrans1;
  set Btrans;
  /*** eliminate certain characters to clean up field and compress blanks out ***/
  /*** let us deal only with numeric digits but still a character field ***/
  tssn = compress(translate( tssn, '', '-/ ,' ));
run;

%let step = %eval( &step + 1 );
Title1 "&Step - DTRANS1 File Records";
Proc Print data=DTRANS1; run;

/*** now let us match on exact matches from DTRANS1 to AMaster and separate them out
     of TRANS1 so we can concentrate on the "like" possible matches ***/

proc sort data=Dtrans1  out=Dtrans1;  by tssn; run;
proc sort data=Cmaster1 out=Cmaster1; by  ssn; run;

data Ematch   /*** only those that match ***/
     Funmatch /*** those that do not match exactly ***/
              /*** and drop variables not in Dtrans1 file ***/
     (drop= Lastname Firstname Address Town State Zip Birthdate
       rename=(ssn=tssn) ); /*** rename ssn to tssn so we can match these two files ***/
merge Cmaster1 (in=m) /*** we will use the IN method to determine where each record comes from ***/
      Dtrans1  (in=t rename=(tssn=ssn)); /*** rename tssn to ssn so we can match on common field ***/
by ssn;
if m and     t then output Ematch;  /*** file of matched records ***/
                                    /*** using IN method to determine if record comes from
                                         both files, values for M and T will be one
                                         when record comes from both files - a match ***/
if t and not m then output Funmatch; /*** file of unmatched DTRANS1 records ***/
                                     /*** output only those records that come from DTRANS1 and
                                          and DO NOT come from Cmaster1
                                          values for T will be one and
                                          M will be zero but adding "not" will reverse it to
                                          one, thus selecting records from t (Dtrans1) only
                                          and not Master.                                   ***/
run;

%let step = %eval( &step + 1 );
Title1 "&Step - EMATCH File Records";
Proc Print data=EMATCH; run;

%let step = %eval( &step + 1 );
Title1 "&Step - FUNMATCH File Records";
Proc Print data=FUNMATCH; run;

/*** Now let us see how close SSN are to each other           ***/
/*** for unmatched records between Cmaster1 and Ctrans1       ***/
/*** Join records where SSN Not Equal to TSSN                 ***/
/*** this will create a Cartesian product of all combinations ***/
proc sql;
 create table Gallcombo as
 select *
  from Cmaster1  as m,
       Funmatch  as t
  where m.ssn ne t.tssn;
quit;

 /*** let us look at the spedis values ***/
data Hspedis_value;
 set Gallcombo;
 spedis_value = spedis(ssn, tssn); /*** cost to convert tssn to ssn ***/
run;

%let step = %eval( &step + 1 );
Title1 "&Step - HSPEDIS_VALUE File Records";
Proc Print data=HSPEDIS_VALUE; run;

/*** select only records that meet our spedis cost criteria ***/
data Ipossible_match (keep=ssn tssn);
 set Hspedis_value;
 if spedis_value le &spedis; /*** How much we are willing to pay for "closeness" ***/
run;

%let step = %eval( &step + 1 );
Title1 "&Step - IPOSSIBLE_MATCH File Records";
Proc Print data=IPOSSIBLE_MATCH; run;

/*** now get these possible matches from both Cmaster1 and Funmatch to compare records ***/
/*** At this point we have only assumed we have SSN to match on without any other data ***/
/*** to compare "likeness" to each other ***/

proc sql;
 create table Jpossible_matches as
  select *
    from Cmaster1 as m,
         Funmatch as u,
         Ipossible_match as p
    where (m.ssn eq p.ssn)
      and (u.tssn eq p.tssn) ;
 quit;

/*** used for proc forms - next step - comestic only           ***/
/*** makes for easier comparisons of JPossible_Matches Records ***/
data Jpossible_matches;
 set Jpossible_matches;
 separator  = ' | ';
 lssn       = "SSN        = ";
 llastname  = "Last Name  = ";
 lfirstname = "First Name = ";
 laddress   = "Address    = ";
 lTown      = "Town       = ";
 lstate     = "State      = ";
 lzip       = "Zip        = ";
 lbirthdate = "Birth Date = ";
run;

%let step = %eval( &step + 1 );
Title1 "&Step - JPOSSIBLE_MATCHES File Records";

 /*** at this point you need to MANUALLY review the possible_matches
      for potential match ***/
 /*** Proc Forms not available in SAS LE - Learning Edition ***/
 proc forms data=Jpossible_matches width=60 lines=8 between=2 pagesize=60;
 Title2 "Master is Left of | and Trans is right of |";
 /*** everything to the left "|" is AMaster Data,
                 to the right of "|" is TRANS Data ***/
 line 1 lssn        SSN        separator tSSN          ;
 line 2 llastname   Lastname   separator tLastname     ;
 line 3 lfirstname  Firstname  separator tFirstname    ;
 line 4 laddress    Address    separator tAddress      ;
 line 5 lTown       Town       separator tTown         ;
 line 6 lstate      State      separator tState        ;
 line 7 lzip        Zip        separator tZip          ;
 line 8 lbirthdate  Birthdate  separator tBirthdate    ;
 format _character_ $20. ;
 format separator $3. ;
 format birthdate tbirthdate worddate20.; /*** format used only to make output line up ***/
 format lssn -- lbirthdate  $13. ; /*** record-field labels ***/
 run;
 title2 " ";

 /*** use this instead of Proc Forms for SAS LE version ***/
Data _null_;
 file print;
 set Jpossible_matches;
 if _n_ = 1 then
 put @11 "Master Records contents| Trans Records contents";
 put lssn         $13.   SSN        $20.        separator $3.  tSSN       $20. /
     llastname    $13.   Lastname   $20.        separator $3.  tLastname  $20. /
     lfirstname   $13.   Firstname  $20.        separator $3.  tFirstname $20. /
     laddress     $13.   Address    $20.        separator $3.  tAddress   $20. /
     lTown        $13.   Town       $20.        separator $3.  tTown      $20. /
     lstate       $13.   State      $20.        separator $3.  tState     $20. /
     lzip         $13.   Zip        $20.        separator $3.  tZip       $20. /
     lbirthdate   $13.   Birthdate  worddate20. separator $3.  tBirthdate worddate20. ///;
 run;






 /*** now let us assume we have last names to do some further comparisons on ***/
 /*** let us look at spedis and soundex function values.                     ***/
 /***  we are continuing with the results from Jpossible_matches             ***/

 /*** The more data fields you have to compare values on, the higher    ***/
 /*** probablity you will have to determine if you have a correct match ***/
 /*** on a given individual - but you could also have more conflicting  ***/
 /*** data values which will create more need for logic determination   ***/
 /*** and judgment calls as to which file has "better quality" data     ***/


 /*** Lets try using the Soundex and Spedis Functions to see
      if Last Names are Close ***/
 data Ksoundex;
  /*** variable to determine if we have match on Last Name ***/
  label match_lname ="Match Lname";
  set Jpossible_matches (drop=lssn--lbirthdate separator);
  match_lname = 0; /*** set to 0 as default ***/

   /*** get soundex value but I will ignore 1st char later of result ***/
   lname_sound = soundex( lastname );
  tlname_sound = soundex( tlastname);
   /*** get spedis cost for Lastname to TLastname ***/
   spedis_lname = spedis( lastname, tlastname );
   /*** The chances of having a close SSN from Jpossible_matches
        and a close match on last name is quite small - acceptable ***/
   /*** we have a strong likely match based upon spedis value on lastname ***/
   if spedis_lname = 0 then match_lname = 1; /*** lastname equals or very close - must be match ***/
   if 0 lt spedis_lname le &spedis then do;
      /*** if last name is within our COST SPEDIS Limits, the chances of having a
           close SSN and similiar sounding lastname is within acceptable tolerance
           for a false positive hit ***/
      /*** check to see if soundex value is embedded in either value -
           that is why double comparison ***/
      if (0 ne index( trim(substr(lname_sound ,2 )) , trim(substr(tlname_sound,2 )) ) )
      or (0 ne index( trim(substr(tlname_sound,2 )) , trim(substr( lname_sound,2 )) ) )
      then match_lname = 1 ; /*** we have a strong likely match based upon soundex ***/
   end;

 /***
      An example of where Spedis Value could cause you to fail when you
      would like to get a hit but Soundex could help

     data test;
      lastname = "Patrick     "; tlastname = "Patridge  "; output;
      lastname = "Knight      "; tlastname = "Night     "; output;
      lastname = "Paridge     "; tlastname = "Partridge "; output;
     run;

      data soundex;
       set test;
      spedis1 = spedis( lastname, tlastname);
      spedis2 = spedis(tlastname,  lastname);
      soundex_L = soundex( lastname );
      soundex_L = soundex(tlastname );
      run;

      lastname    tlastname    spedis1   spedis2    soundex_L  soundex_T
                               lastname  tlastname  lastname   tlastname
                               tlstname  lastname
    1 Patrick     Patridge     35        29         P362       P3632
    2 Knight      Night        50        50         K523       N23
    3 Paridge     Partridge    14        22         P632       P63632

      record 1 - both would fail unless you increase spedis value but
                 you probably would get a false positive hit

      record 2 - higher spedis value but Soundex gives a reasonable likeness,
                 hence you probably might want this match to occur

      record 3 - a good spedis value but Soundex would fail using my assumption
                 in the above example

    Hence, here is where you would need to try various logical conditional
    statements to get what you desire for results using SOUNDEX and/or
    Spedis values. Also, you will see that you get different SPEDIS values
    depending on your useage of the SPEDIS arguments.
 ***/
 run;


%let step = %eval( &step + 1 );
Title1 "&Step - KSOUNDEX File Records";
Proc Print data=KSOUNDEX; run;


/*** Now let us select only those records which have been determined to match from previous step ***/
/*** and we will assume our AMASTER file has the BEST QUALITY data so we will not use any data   ***/
/*** from TRANS except for the INCOME variable                                                   ***/
data Lmatch_final;
 set Ksoundex (where = (match_lname = 1) )  /*** possible matches ***/
     Ematch                                 /*** exact matches    ***/
 ;
 keep ssn tssn income;
run;

%let step = %eval( &step + 1 );
Title1 "&Step - LMATCH_FINAL File Records";
Proc Print data=LMATCH_FINAL ; run;

/*** you will see we have two records for SSN = 023333207     ***/
/*** so, we will sort and keep only the highest income record ***/
/*** for each record - you may want to do something else      ***/

Proc sort data=Lmatch_final out=Lmatch_final; by ssn income; run;

data Mmatch_final_nodups;
 set Lmatch_final;
 by ssn income;
 /*** using the BY statement, you will have access to two special logicals -
      "first." and "last." - strongly recommended you become familiar with these -
      very helpful and necessary in your SAS developer toolset ***/

 if last.ssn; /*** select only last record (highest income) for each SSN ***/
 ssn  = put(input( ssn,9.) , ssn11.); /*** put  SSN back into its normal form as in AMASTER File ***/
 tssn = put(input(tssn,9.) , ssn11.); /*** put TSSN back into its normal form as in BTRANS File ***/
run;

%let step = %eval( &step + 1 );
Title1 "&Step - MMATCH_FINAL_NODUPS File Records";
Proc Print data=MMATCH_FINAL_NODUPS ; run;

/*** now back up AMaster file - just in case you need to restore it ***/
data Nmaster_backup; set Amaster; run;

%let step = %eval( &step + 1 );
Title1 "&Step - Nmaster_backup File Records";
Proc Print data=Nmaster_backup ; run;

proc sort data=Amaster out=Omaster; by ssn; run; /*** sort Amaster in SSN order for updating ***/

%let step = %eval( &step + 1 );
Title1 "&Step - OMASTER File Records - Before Update";
Proc Print data=OMASTER ; run;

data Omaster;
 update Omaster Mmatch_final_nodups (drop=tssn);
 by ssn;
 /*** also need to understand how UPDATE works - read up on this ***/
run;

%let step = %eval( &step + 1 );
Title1 "&Step - OMASTER File Records - After Update";
Proc Print data=OMASTER ; run;
/*** compare the last 2 prints to see which records were updated ***/

/*** now let us see which BTRANS records never matched                  ***/
/*** first get all possible SSNs that matched or were close to matching ***/
/*** this means getting both SSN and TSSN and creating multiple records ***/
/*** from 1 record                                                      ***/

data Pmatched_ssns (keep=tmpssn);
 set Lmatch_final;
     tmpssn = ssn;
     if tmpssn ne ' ' then do;
       tmpssn = put(input(tmpssn, 9.), ssn11. );
        output;
     end;
     tmpssn =tssn;
     if tmpssn ne ' ' then do;
       tmpssn = put(input(tmpssn, 9.), ssn11. );
        output;
     end;
run;

%let step = %eval( &step + 1 );
Title1 "&Step - PMATCHED_SSNS File Records";
Proc Print data=PMATCHED_SSNS ; run;

proc sort data=Pmatched_ssns out=Pmatched_ssns nodupkey; by tmpssn; run;
proc sort data=BTRANS        out=BTRANS                ; by tssn  ; run;

/*** now find BTRANS records that did not match or closely match         ***/
/*** these records will need to be reviewed as to why they did not match ***/
/*** this depends on how, why and where TRANS was created for your needs ***/
data Qtrans_nomatch;
 merge Btrans (in=T)
       Pmatched_ssns (in=M rename=(tmpssn=tssn));
 by tssn;
 if T and not M;
run;

%let step = %eval( &step + 1 );
Title1 "&Step - QTRANS_NOMATCH File Records";
Proc Print data=QTRANS_NOMATCH ; run;

/*** you will need to decide what to do with these records from BTRANS that
     did not match any records in Amaster file ***/

ods pdf close;


/*** Modified Solution by Arthur Tabachneck ***/ /*** Another method to compare all BTRANS tssn with Amaster SSN - Using Arrays and data step statements and no SQL ***/ /*** get number of records with Amaster dataset - little known tip ***/ data _null_; if 0 then set Amaster nobs=nobs; /*** no need to read dataset - just metadata ***/ CALL SYMPUT('NUMREC',nobs); /*** put # of records into NUMREC macro var ***/ stop; /*** stop, got number of records ***/ run; /*** now read master ssn into an array, and then read Btrans tssns and compare all combinations ***/ data matches; retain closest lowscore; /*** retain lowest score ***/ label assn="master ssn"; /*** a way to order ASSN to appear near top of Var List for easy comparison of TSSN and ASSN, as well as label the variable ASSN ***/ array Amaster(&numrec) $9; /*** creat an array with same number of elements as there are records in amaster ***/ i=0; do until (eof1); /*** load the array with master ssn ***/ set Amaster end=eof1; i=i+1; Amaster(i)=SSN; end; /*** now read Btrans record by record ***/ do until (eof2); set Btrans end=eof2; closest=0; lowscore=100; /*** set maximum lowscore to 100 (arbitrary value) as default ***/ do i=1 to &numrec; /*** each Btrans record with each ssn in the Amaster array ***/ /*** now determine spedis value ***/ score=spedis(tSSN,Amaster(i)); if score le lowscore then do; /*** if spedis value le 100 - reset lowscore to spedis value ***/ lowscore=score; closest=i; /*** keep array element index to know which one is closest ***/ end; end; if lowscore le 20 then do; /*** a score of 20 is determined to be acceptable ***/ ASSN=Amaster(closest); /*** get Amaster SSN which is closest to Tssn ***/ output; /*** now output TSSN and closest ASSN with other data ***/ end; end; keep lowscore tSSN ASSN tLastname tFirstname tAddress tTown tState tZip tBirthdate Income; run;
/*** end of tip 00392 ***/