/****************************************************************************/
/*** TIP 00128F -A variation of TIP 00128A but instead of using arrays to ***/
/*** cleanse various SAS variables, this will attempt to use ***/
/*** formats to cleanse the data - much faster than arrays ***/
/*** When scrubbing large files this should be as fast as ***/
/*** possible, and only need to pass through the file once. ***/
/*** Also, can scrub multiple fields during the same pass of ***/
/*** a data read. ***/
/*** ***/
/*** The Formats are tricky in that CASE sensitive words ***/
/*** need to be accounted for all possibilties. ***/
/*** ***/
/*** And when trying to scrub phrases of words, you will need ***/
/*** to call the macro cleanse with parse=noparse ***/
/*** look at illnesses to see example of using phrases. ***/
/*** ***/
/*** ***/
/*** Also incorporated cleaning groups of words before doing ***/
/*** the words separately, ie "THE NEW YORK GROUP" gets ***/
/*** modified to "THE NY GROUP". This is done before doing ***/
/*** each word separately. ***/
/*** ***/
/*** Otherwise, see address cleanse example to see how to ***/
/*** scrub each word of a sas variable. ***/
/*** ***/
/*** I am converting all fields to UPPERCASE to make the ***/
/*** scrubbing easier and consistent to ignore mixed case ***/
/*** ***/
/*** Author: Charles Patridge ***/
/*** Tested under Windows XP and Version SAS 8.2 ***/
/*** ***/
/****************************************************************************/
%let strlen = 50 ; /*** arbitary length of target and replacement strings ***/
/*** should be at least the same length sas fields you ***/
/*** are trying to scrub ***/
/*** you might wish to setup separate length macros ***/
/*** to handle varying widths for each field to scrub ***/
%let fmtlen = 40 ; /*** length for your formats to scrub the data ***/
/*** can not exceed max length of proc format limits ***/
%let _specchar_ = ' .!@#$%&*()-_{}[]|\/<>,'; /*** delimiters ***/
/*** these delimiters are used to separate words by ***/
/*** translating them to blanks and then using blanks ***/
/*** as the standard delimiter below ***/
%let _delim_ = ' '; /*** delimiters ***/
/*** Test Data ***/
data Mydata;
attrib illness length=$50 label="Describe Illness";
attrib fname length=$50 label="First Name";
attrib lname length=$50 label="Last Name";
attrib address length=$50 label="Street" ;
attrib city length=$50 label="City" ;
attrib state length=$50 label="State" ;
attrib zipcode length=$10 label="Zip Code" ;
infile cards delimiter=',' missover;
input fname lname illness address city state zipcode;
/*** setup std_addr for comparisons ***/
/*** translate various delimiters to blanks ***/
std_addr = translate( address, ' ', &_specchar_ );
std_addr = %cmpres( std_addr ); /*** eliminate multiple consecutive blanks ***/
std_fname = translate( fname, ' ', &_specchar_ );
std_fname = %cmpres( std_fname ); /*** eliminate multiple consecutive blanks ***/
std_fname = upcase( std_fname ); /*** upcase to ignore case ***/
/*** setup std_illness for comparisons ***/
/*** translate various delimiters to blanks ***/
std_illness = translate( illness, ' ', &_specchar_ );
std_illness = %cmpres( std_illness ); /*** eliminate multiple consecutive blanks ***/
std_illness = upcase( std_illness ); /*** upcase to ignore case sensitivity ***/
output;
return;
eof:
/*** test for hex codes ***/
fname = 'Chas';
lname = '00'x || ' ' || 'Patridge IV';
illness='my head throbs';
address = 'PO BOX 172';
city = 'Unionville';
state = 'CT';
zipcode = '06013';
output;
return;
cards;
The New York Dentists, Lastname, Headaches, 172 Burlington Ave, Burlington, CT, 06085
Chucky,Mr Patridge III, aching head, 172 Monce Road,Burlington,CT,06013
Charles,Mr Patridge III, aching head, 172 Monce Road,Burlington,CT,06013
Charlie,Mister Patridge SR, throbbing head, 172 Monce Road West,Burlington,CT,06013
Chasity,Miss Patridge ATTNY, head hurts , 172 Monce Boulevard,Burlington,Conn,06013
Chasity,Ms Patridge ATTNY, head hurts , 172 Monce Boulevard,Burlington,Conn,06013
Chuck,Dr Patridge, head aches, BOX 172,Burlington,Connecticut,06013
Chuckie, Patridge, head throbs, PO Box 172,Burlington,Ct,06013
Chas, Partridge, headache, 172 East Monce Road,Burlington,Vermont,06013
;;;;
run;
/*** Data set to Clean Address ***/
data clnaddr;
retain fmtname '$stdaddr';
length start $&strlen label $ &strlen;
infile cards delimiter=',' missover;
input start label;
/*** upcase variables to ignore case sensitivity ***/
start = upcase(start);
label = upcase(label);
cards;
Avenues,Ave
Avenue,Ave
Boulevard,Blvd
Circle,Cir
Crossing,Xing
Drawer,Box
Draw ,Box
Drive,Dr
East,E
Floor,Fl
Fort,Ft
Heights,Hgt
Height,Hgt
Highway,Hwy
Lane,Ln
North,N
Park,Pk
Post Office Box,Box
P O Box,Box
PO Box,Box
POBox,Box
Pointe,Pt
Point,Pt
Road,Rd
Route,Rt
South,S
Square,Sq
Street,St
Terrace,Tr
Turnpike,Tpk
West,W
;;;;
run;
data clnaddr;
set clnaddr end=eof;
if start = ' ' then delete; /*** delete any blank codes ***/
output;
if eof then do; /*** need to force length of format to be certain length ***/
start = repeat( 'Z', &fmtlen );
label = start;
output;
end;
run;
/*** create format to cleaning address ***/
proc sort data=clnaddr out=clnaddr nodupkey; by start; run;
proc format cntlin=clnaddr; run;
/*** Data set to Clean Illness Description ***/
/*** will need to use phrases to code so noparse will be 1 macro options ***/
data clnilln;
retain fmtname "$stdill";
length start $&strlen label $ &strlen;
infile cards delimiter=',' missover;
input start label;
/*** upcase variables to ignore case sensitivity ***/
start = upcase(start);
label = upcase(label);
cards;
head aching,headache
head aches,headache
head hurts,headache
head hurt ,headache
aching head,headache
head throbs,headache
head throbbing,headache
throbbing head,headache
my head throbs,headache
;;;;
run;
data clnilln;
set clnilln end=eof;
if start = ' ' then delete;
output;
if eof then do;
start = repeat( 'Z', &fmtlen );
label = start;
output;
end;
run;
/*** create format to clean illness ***/
proc sort data=clnilln out=clnilln nodupkey; by start; run;
proc format cntlin=clnilln; run;
/*** Data set to Clean Groups of Words within string fields ***/
/*** this will be used before doing each word separately to ***/
/*** ensure the words within a group do not get changed by ***/
/*** doing them separately and getting modified ***/
data clngroup;
retain fmtname "$stdgrp";
length start $&strlen label $ &strlen;
infile cards delimiter=',' missover;
input start label;
/*** upcase variables to ignore case sensitivity ***/
start = upcase(start);
label = upcase(label);
cards;
New York,NY
New Jersey,NJ
New Mexico,NM
North Dakota,ND
North Carolina,NC
South Dakota,SD
South Carolina,SC
;;;;
run;
data clngroup;
set clngroup end=eof;
if start = ' ' then delete;
output;
if eof then do;
start = repeat( 'Z', &fmtlen );
label = repeat( 'Z', &fmtlen );
output;
end;
run;
/*** create format to clean illness ***/
proc sort data=clngroup out=clngroup nodupkey; by start; run;
proc format cntlin=clngroup; run;
/*** macro used within cleanse macro below - to join scrubbed words
back into a single sas variable ***/
%macro looptrim(_numwords_);
%do _ww_ = 1 %to &_numwords_;
trim(_&_pass.a_(&_ww_)) || ' ' ||
%end;
%mend looptrim;
/*** EMPTYYN Macro to get NUMBER OF OBS ***/
%GLOBAL EMPTYYN NUMOBS DSN;
%MACRO EMPTYYN(DSNAME=&syslast);
%let dsn = &dsname;
DATA _NULL_;
IF 0 THEN SET &DSNAME NOBS=NUMOBS;
IF NUMOBS > 0 THEN EMPTYYN = 'N';
ELSE EMPTYYN = 'Y';
CALL SYMPUT('EMPTYYN',PUT(EMPTYYN, $1.));
CALL SYMPUT('NUMOBS' ,PUT(NUMOBS , BEST.));
STOP;
RUN;
%MEND EMPTYYN;
%emptyyn(DSNAME=work.clngroup);
%let _numgrp_ = &numobs;
%macro cleanse(strlen=50, var=, stdfmt=, case=nocase, _wordlmt_=10, parse=parse);
/*****************************************************************/
/*** Author: Charles Patridge ***/
/*** Email: Charles_S_Patridge@prodigy.net ***/
/*** Version: 1.0 variation of tip00128a - using formats ***/
/*** Date: Oct. 04, 2004 ***/
/*** Tested and created in SAS V8.2 - Windows ***/
/*** ***/
/*** 5 Parameters for this Macro ***/
/*** strlen = length of character variable Start and ***/
/*** Label ***/
/*** ***/
/*** var = SAS variable name to be cleansed ***/
/*** ***/
/*** stdfmt = Format to use to standardize field ***/
/*** include the length of format ***/
/*** ex: $stdaddr30. ***/
/*** ***/
/*** case = Modify output to LOWCASE, UPCASE or NOCASE ***/
/*** LOWCASE will convert output to Lower Case ***/
/*** UPCASE will convert output to Upper Case ***/
/*** NOCASE will leave output as same as input ***/
/*** but it will not convert if not exact match ***/
/*** This only affects the SAS var you are ***/
/*** trying to cleanse, and does not affect any ***/
/*** other SAS variables in your dataset. ***/
/*** ***/
/*** _wordlmt_= Limit number of words a variable can contain ***/
/*** ***/
/*** parse = parse - separate var into separate words ***/
/*** for scrubbing purposes ***/
/*** noparse - keep as a phrase of words to match ***/
/*** for scrubbing purposes ***/
/*****************************************************************/
%let parse = %upcase( &parse );
if _n_ = 1 then do;
%let _pass = &sysindex; /*** sas system macro ***/
/*** _pass = used to create separate array statements for ***/
/*** each SAS variable to be cleansed ***/
%let case = %upcase(&case); /*** up case CASE parameter ***/
%if &case = NOCASE %then %let case = ; /*** set to blank if nocase ***/
/*** setup an array to hold abbrievated words ***/
array _&_pass.a_ (&_wordlmt_) $ &strlen _temporary_; /*** # abbr words Array ***/
array _&_pass.g_ (&_numgrp_ ) $ &strlen _temporary_; /*** # abbr groups Array ***/
do _k_ = 1 to &_numgrp_;
_recd = _k_;
set clngroup point=_recd;
_&_pass.g_(_k_) = &case(start);
end;
drop _words_ _w_ _k_ start label fmtname;
end;
/*** change case for variable to be cleansed if requested ***/
&var = &case( &var );
/*** abbr groups of words first before trying each word separately ***/
/*** clean up groups of words first ***/
do _w_ = 1 to &_numgrp_;
&var = tranwrd(&var, trim(_&_pass.g_(_w_)), trim(put(trim(_&_pass.g_(_w_)),$stdgrp.)));
end;
/*** determine how many words in variable ***/
_words_ = 1+ (length(trim(&var))
- length(trim(compress(&var, &_delim_))));
/*** get minimum words of limit or what are actual number of words ***/
_words_ = min( &_wordlmt_, _words_);
/*** initialize array with blanks ***/
do _w_ = 1 to dim(_&_pass.a_);
_&_pass.a_(_w_) = ' ';
end;
/*** put variable initial separate words into array and scrub using format ***/
%if &parse = PARSE %then %do;
do _w_ = 1 to _words_;
_&_pass.a_(_w_) = put(trim(left(scan( &var, _w_, &_delim_))), &stdfmt );;
end;
%end;
/*** when you need to map a phrase of words use noparse ***/
/*** put variable initial phrase of words into array and scrub using format ***/
%if &parse ne PARSE %then %do;
_&_pass.a_(1) = put(trim(left( &var)), &stdfmt );;
%end;
&var = %looptrim( &_wordlmt_ ) ' ' ;;; /*** limited # separate words ***/
&var = left(&var); /*** left justify variable to be cleansed ***/
%mend cleanse;
data clean;
set mydata;
/*** make sure format length used do not exceed maximum allowed by format ***/
%cleanse(var=std_fname , stdfmt=$stdgrp&fmtlen.. , case=UPCASE, _wordlmt_=10 );
%cleanse(var=std_addr , stdfmt=$stdaddr&fmtlen.., case=UPCASE, _wordlmt_=10 );
%cleanse(var=std_illness , stdfmt=$stdill&fmtlen.. , case=UPCASE, _wordlmt_=10, parse=noparse );
run;