/*** Tip 00128a ***/

 %let strlen = 50 ; /*** arbitary length of target and replacement strings ***/
 %let _delim_ = ' ,./\()*-_+=:;<>|{}[]'; /*** delimiters ***/

  /*** Test Data ***/
  data Mydata;
   attrib illness length=$50  label="Describe Illness";
   attrib fname   length=$50  label="First Name";
   attrib lname   length=$50  label="Last  Name";
   attrib address length=$50  label="Street"    ;
   attrib city    length=$50  label="City"      ;
   attrib state   length=$50  label="State"     ;
   attrib zipcode length=$10  label="Zip Code"  ;

   infile cards delimiter=',' missover eof=eof;
   input fname lname illness address city state zipcode;
   output;
   return;
   eof:
     /*** test for hex codes ***/
     fname = 'Chas';
     lname = '00'x || ' ' || 'Patridge IV';
     illness='my head throbs';
     address = 'PO BOX 172';
     city = 'Unionville';
     state = 'CT';
     zipcode = '06013';
     output;
  return;
  cards;
  Charles,Mr Patridge III, aching head, 172 Monce Road,Burlington,CT,06013
  Charlie,Mister Patridge SR, throbbing head, 172 Monce Road West,Burlington,CT,06013
  Chasity,Miss Patridge ATTNY, head hurts , 172 Monce Boulevard,Burlington,Conn,06013
  Chasity,Ms Patridge ATTNY, head hurts , 172 Monce Boulevard,Burlington,Conn,06013
  Chuck,Dr Patridge, head aches, BOX 172,Burlington,Connecticut,06013
  Chuckie, Patridge, head throbs, PO Box 172,Burlington,Ct,06013
  Chas, Partridge, headache, 172 East Monce Road,Burlington,Vermont,06013
  ;;;;
 run;

  /*** Data set to Clean Illness Description ***/
  data clnilln;
   length target $&strlen replace $ &strlen;
   infile cards delimiter=',' missover;
   input target replace;
  cards;
  head aching,headache
  head aches,headache
  head hurts,headache
  head hurt ,headache
  aching head,headache
  head throbs,headache
  head throbbing,headache
  my head throbs,headache
  ;;;;
 run;

  /*** Data set to Clean High Bit Codes ***/
  /*** will only work if high bit codes separated by a delimiter ***/
  data clnhghbt;
  length target $&strlen replace $ &strlen;
  target = '00'x ; replace = ' ' ; output;
  target = '01'x ; replace = ' ' ; output;
  target = '09'x ; replace = ' ' ; output;
 run;

  /*** Data set to Clean Surnames Suffix or Prefix ***/
  data clnsuffx;
   length target $&strlen replace $ &strlen;
   infile cards delimiter=',' missover;
   input target replace;
  cards;
  I,
  i,
  II,
  ii,
  III,
  iii,
  IV,
  iv,
  JUNIOR,
  Junior,
  junior,
  JR,
  jr,
  Senior,
  SENIOR,
  SR,
  sr,
  PHD,
  Phd,
  PhD,
  phd,
  Attorney at Law,
  ATTORNEY AT LAW,
  attorney at law,
  Attorney,
  ATTORNEY,
  Attny,
  ATTNY,
  attny,
  DDS,
  Doctor,
  DOCTOR,
  DR,
  Dr,
  dr,
  Mister,
  Mr,
  MR,
  Ms,
  MS,
  Miss,
  MISS,
  ;;;;
 run;

  /*** Data set to Clean First Name ***/
  data clnfname;
   length target $&strlen replace $ &strlen;
   infile cards delimiter=',' missover;
   input target replace;
  cards;
  Chas,Charles
  CHUCK,Charles
  Chuckie,Charles
  Charlie,Charles
  Charley,Charles
  Charles,Chas
  ;;;;
 run;

  /*** Data set to Clean Address ***/
  data clnaddr;
   length target $&strlen replace $ &strlen;
   infile cards delimiter=',' missover;
   input target replace;
  cards;
Avenues,Ave
Avenue,Ave
Boulevard,Blvd
Circle,Cir
Crossing,Xing
Drawer,Box
Draw ,Box
Drive,Dr
East,E
Floor,Fl
Fort,Ft
Heights,Hgt
Height,Hgt
Highway,Hwy
Lane,Ln
North,N
Park,Pk
Post Office Box,Box
P O Box,Box
PO Box,Box
POBox,Box
Pointe,Pt
Point,Pt
Road,Rd
Route,Rt
South,S
Square,Sq
Street,St
Terrace,Tr
Turnpike,Tpk
West,W
;;;;
run;

  /*** Data set to Clean State   ***/
  data clnstate;
   length target $&strlen replace $ &strlen;
   infile cards delimiter=',' missover;
   input target replace;
  cards;
Alaska, AK
Alabama, AL
Arizona, AZ
Arkansas, AR
Calif, CA
California, CA
Colorado, CO
Conn, CT
Connecticut, CT
District of Columbia, DC
Dist of Columbia, DC
Fla, FL
Florida, FL
Georgia, GA
Hawaii, HI
Idaho, ID
Ill, IL
Illinois, IL
Ind, IN
Indiana, IN
Iowa, IA
Kansas, KS
Kentucky, KY
Louisana, LA
Maine, ME
Mass, MA
Massachusetts, MA
Mich, MI
Michigan, MI
Minn, MN
Minnesota, MN
Miss, MS
Mississippi, MS
Missouri, MO
Montana, MT
Nebraska, NE
Nevada, NV
NewJersey, NJ
New Jersey, NJ
NewMexico, NM
New Mexico, NM
NewYork, NY
NewYorkCity, NY
New York City, NY
New York, NY
NorthCarolina, NC
North Carolina, NC
N Carolina, NC
NorthDakota, ND
North Dakota, ND
N Dakota, ND
Ohio, OH
Oklahoma, OK
Oregon, OR
Ore, OR
Penn, PA
Pennsylvania, PA
RhodeIsland, RI
Rhode Island, RI
SouthCarolina, SC
South Carolina, SC
S Carolina, SC
SouthDakota, SD
South Dakota, SD
S Dakota, SD
Tenn, TN
Tennessee, TN
Texas, TX
Utah, UT
Vermont, VT
Virginia, VA
Wash, WA
Washington, WA
WestVirginia, WV
West Virginia, WV
W Virginia, WV
Wisconsin, WI
Wisc, WI
Wyoming, WY
;;;;
run;

 /*** EMPTYYN Macro to get NUMBER OF OBS ***/
  %GLOBAL EMPTYYN NUMOBS DSN;
  %MACRO EMPTYYN(DSNAME=&syslast);
    %let dsn = &dsname;
    DATA _NULL_;
     IF 0 THEN SET &DSNAME NOBS=NUMOBS;
     IF NUMOBS > 0 THEN EMPTYYN = 'N';
                   ELSE EMPTYYN = 'Y';
     CALL SYMPUT('EMPTYYN',PUT(EMPTYYN, $1.));
     CALL SYMPUT('NUMOBS' ,PUT(NUMOBS , BEST.));
     STOP;
    RUN;
  %MEND EMPTYYN;

  %macro cleanse(schlib=work, schema=, strlen=50,var=,target=target,replace=replace,case=nocase);
   /*****************************************************************/
   /*** Author: Charles Patridge                                  ***/
   /***  Email: Charles_S_Patridge@prodigy.net                    ***/
   /***  Version: 2.1 (suggestions by Ian Whitlock)               ***/
   /***  Date: Dec. 03, 2001                                      ***/
   /***  Tested and created in SAS V6.12 - Alpha OpenVMS          ***/
   /***                                                           ***/
   /***  7 Parameters for this Macro                              ***/
   /***   schlib   = SAS Library Holding Schema File              ***/
   /***   schema   = SAS Schema/Cleansing File                    ***/
   /***   strlen   = length of character variable Target and      ***/
   /***              Replace                                      ***/
   /***   var      = SAS variable name to be cleansed             ***/
   /***   target   = Target Var   String to be Cleansed           ***/
   /***               Schema Var Name (ie Target)                 ***/
   /***   replace  = Replacement Var  String after Cleansing      ***/
   /***               Schema Var Name (ie Replace)                ***/
   /***   case     = Modify output to LOWCASE, UPCASE or NOCASE   ***/
   /***              LOWCASE will convert output to Lower Case    ***/
   /***              UPCASE  will convert output to Upper Case    ***/
   /***              NOCASE  will leave output as same as input   ***/
   /***               but it will not convert if not exact match  ***/
   /***              This only affects the SAS var you are        ***/
   /***              trying to cleanse, and does not affect any   ***/
   /***              other SAS variables in your dataset.         ***/
   /***                                                           ***/
   /*** WARNING!!!                                                ***/
   /*** If Replacement String is Longer than Target String, data  ***/
   /*** could become truncated in the output dataset.             ***/
   /***                                                           ***/
   /*** added logic so if TARGET is just one word it must match   ***/
   /*** otherwise Chasity would become Charlesity as an example   ***/
   /*****************************************************************/
    %let _pass = &sysindex;
   /***  _pass    = used to create separate array statements for  ***/
   /***             each schema used                              ***/
    %let case = %upcase(&case); /*** up case CASE parameter ***/
    %if &case = NOCASE %then %let case = ; /*** set to blank if nocase ***/

   /*** need to create separate array statements for each schema applied ***/
     array _&_pass.t_ (&&&schema) $ &strlen _temporary_; /*** Target  Array ***/
     array _&_pass.r_ (&&&schema) $ &strlen _temporary_; /*** Replace Array ***/
     array _&_pass.w_ (&&&schema)           _temporary_; /*** # words Array ***/

     drop _k_ &target &replace _words _w_  ;
     /*** get the Schema Array and Retain Them ***/
     if _n_ = 1 then do;
        do _k_ = 1 to &&&schema;
           _recd = _k_;
           set &schlib..&schema point=_recd;
           _&_pass.t_(_k_) = &case(&target);
           _&_pass.r_(_k_) = &case(&replace);
           _&_pass.w_(_k_) = _words;
        end;
     end;

     /*** change case for variable to be cleansed if requested ***/
     &var = &case( &var );

     /*** make the substitutions indicated by the schema ***/
     do _k_ = 1 to &&&schema;
         /*** using TRANWRD, cleanse the variable - var ***/
         /*** if target is more than 1 word just replace   ***/
       if _&_pass.w_(_k_) gt 1 then
          &var = tranwrd(&var, trim(_&_pass.t_(_k_)), trim(_&_pass.r_(_k_)) );
       else
       do; /*** Logic if TARGET is just ONE word ***/
         _words = 1+ (length(trim(&var))
                    - length(trim(compress(&var, &_delim_))));
         do _w_ = 1 to _words;
         /*** using TRANWRD, cleanse the variable - _mystr ***/
         /*** if target eq 1 word then string must equal   ***/
            if trim(_&_pass.t_(_k_)) = scan( &var, _w_, &_delim_) then
               &var = tranwrd(&var, trim(_&_pass.t_(_k_)), trim(_&_pass.r_(_k_)));
         end;
       end;
     end;

     &var = left(&var); /*** left justify variable to be cleansed ***/
  %mend cleanse;

  %macro chkschem (schlib=work, schema=, target=target, replace=replace);
    %global &schema  ; /*** macro var for # of schema records  ***/
   /*** check for cyclical mappings within the schema file       ***/
   proc sql;
    create table _check_ as
      select distinct a.&target, a.&replace
      from &schlib..&schema as a,
           &schlib..&schema as b
      where b.&replace =a.⌖
   quit;

   %emptyyn ( dsname=&schlib..&schema );   /*** get number of schema records ***/
   %let &schema = &numobs; /*** set # of schema records      ***/

   %emptyyn ( dsname=work._check_ );
    /*** check to see if there is any cyclical mappings ***/
    %if &numobs ne 0 %then %do;
        title1 "&schlib..&schema file as Cyclical Mappings-Suggest You Correct";
        proc print data=_check_; run;
    %end;

    /*** delete _check_ file - no longer needed ***/
    proc datasets library=work nolist; delete _check_; quit;

   /*** need to sort in descending length of replacement string ***/
   /*** otherwise, you will get unexpected results              ***/
   data &schema;
    set &schlib..&schema;
    _len = length(trim(&target));
    _words = 1+ (length(trim(&target)) - length(trim(compress(&target, &_delim_))));
   run;

   proc sort data=&schema out=&schema (drop=_len); by descending _len; run;
 %mend chkschem;


  /*** need to check each schema for cyclical mappings  ***/
  /*** as well as create a global macro variable to get ***/
  /*** number of records for each schema used           ***/
  /*** CHECK OUTPUT for any cyclical mapping reports    ***/

  %chkschem( schema=clnfname);
  %chkschem( schema=clnsuffx);
  %chkschem( schema=clnhghbt);
  %chkschem( schema=clnilln );
  %chkschem( schema=clnaddr );
  %chkschem( schema=clnstate);

 data clean;
  set mydata;

        %cleanse(schema=clnfname ,var=fname   );
        %cleanse(schema=clnsuffx ,var=lname   );
        %cleanse(schema=clnhghbt ,var=lname   );
        %cleanse(schema=clnilln  ,var=illness );
        %cleanse(schema=clnaddr  ,var=address );
        %cleanse(schema=clnstate ,var=state  , case=UPCASE);

   /*** check to see if zip code and state are consistent ***/
   if zipstate( substr(zipcode,1,5) ) ne trim(state)
      then FLAG = "BAD Zip/State Combo";
 run;

/*** end of tip 00128a ***/