User:Statsrick/SAS code
Read in an AWS S3 directory and iterate over files |
---|
libname L "/home/ec2-user/SAS_DATA/";
/* Read an AWS S3 directory listing, then download and import each file,
   appending everything into one output data set.
   s3dir      = S3 URL to list (e.g. s3://bucket/prefix/)
   outputdata = name of the combined output data set */
%macro readin(s3dir=,outputdata=);
/* Pipe the AWS CLI directory listing into SAS */
filename DIRLIST pipe "aws s3 ls &s3dir";
data dirlist;
infile DIRLIST lrecl=200 truncover;
input line $200.;
length file_name $ 200;
/* Each `aws s3 ls` line looks like: <date> <time> <size> <key> */
date=input(scan(line,1," "),yymmdd10.); format date yymmdd10.;
time=scan(line,2," ");
size=scan(line,3," ")*1; /* *1 converts the character token to numeric */
file_name=scan(line,-1," "); /* object key only -- NOT a full s3:// URL */
run;
proc print data=dirlist;
run;
/* Stash each file name in macro variables full1..fullN, count in nfiles */
data _null_;
set dirlist end=end;
count+1;
call symputx('full'||left(count),file_name); /* symputx strips blanks */
if end then call symputx('nfiles',count);
run;
%do i=1 %to &nfiles;
%let temp=&&full&i;
/* BUGFIX: the listing holds only the object key, so prefix &s3dir to
   build the full S3 URL.  Also, --force is an s3cmd option, not a valid
   awscli flag; `aws s3 cp` overwrites the local target by default. */
data _null_;
call system("aws s3 cp &s3dir&temp /home/ec2-user/CSV/temp.csv");
run;
proc import datafile="/home/ec2-user/CSV/temp.csv"
out=temp
dbms=csv
replace;
guessingrows=32767; /* get column widths right. default is 20 */
getnames=yes;
run;
data temp;
set temp;
*do stuff;
run;
/* First file creates the output data set; later files are appended */
%if &i=1 %then %do;
data &outputdata; set temp; run;
%end;
%if &i>1 %then %do;
proc append base=&outputdata data=temp; run;
%end;
%end; *i;
%mend readin;
%readin(s3dir=s3://hearstdataservices/,outputdata=L.hmg_yvar_content);
|
Macro to export a SAS dataset as a json file |
---|
/* Export a SAS data set as a JSON array of flat objects.
   indata  = data set to export
   outfile = path of the JSON file to write
   NOTE(review): every value is emitted as a quoted string and no character
   escaping is performed, so values containing " or \ will produce invalid
   JSON -- confirm the input data are clean before relying on the output. */
%macro json(indata=,outfile=);
/* Collect the variable names in position (varnum) order */
proc contents data=&indata noprint out=cont;
proc sort data=cont; by varnum;
data _null_;
set cont end=eof;
call symput(compress("var"||put(_n_,best10.)),trim(name));
/* nv = number of variables; mv = nv-1 (the last variable closes the object) */
if eof then do; call symput("nv",compress(put(_N_,best10.))); call symput("mv",compress(put(_N_-1,best10.))); end;
run;
data _null_;
file "&outfile" PS=32767;
set &indata end=lastrec;
if _N_ eq 1 then do;
put '[';
end;
/* BUGFIX: +(-1) backs the column pointer up one so the trailing blank that
   list output appends after each value does not land inside the quoted
   JSON string (original wrote e.g. "value ","). */
put '{ "' @; put "%trim(&var1)"@; put '":"' &var1 +(-1) '",';
%do i = 2 %to &mv;
put '"' @; put "%trim(&&var&i)"@; put '":"' &&var&i +(-1) '",';
%end;
put '"' @; put "%trim(&&var&nv)"@; put '":"' &&var&nv +(-1) '"}';
/* Comma between records; close the array after the last record */
if lastrec eq 1 then do;
put ']';
end;
else do;
put ',';
end;
RUN;
%mend json;
%json(indata=amn.Contout,outfile=\\psf\Home\Desktop\test.json);
|
Read a flat file using PROC IMPORT and get the column widths correct |
---|
/* Read a flat file with PROC IMPORT; GUESSINGROWS=32767 scans more rows so
   character column widths are inferred correctly.
   BUGFIX: the opening quote on the datafile path was missing. */
proc import datafile="C:\Dropbox\data.csv"
out=<output dataset>
dbms=csv
replace;
guessingrows=32767; /* default is 20 rows */
getnames=yes;
run;
|
Read a directory list into a data set |
---|
/* In Windows */
/* NOTE(review): &location. must be %let-defined by the caller before use */
filename pipedir pipe "dir &location. /b " lrecl=32767;
data filenames;
infile pipedir truncover;
input line $char1000.;
run;
/* In Linux */
/* Pipe an s3cmd bucket listing into SAS; listing lines look like:
   <date> <time> <size> <uri> */
filename DIRLIST pipe '/home/ec2-user/s3cmd/s3cmd ls s3://hearstlogfiles/rawlogs/';
data dirlist;
infile dirlist lrecl=200 truncover;
input line $200.;
length file_name $ 200;
date=input(scan(line,1," "),yymmdd10.); format date yymmdd10.;
time=scan(line,2," ");
size=scan(line,3," ")*1; /* *1 converts the character token to numeric */
file_name=scan(line,-1," "); /* last token = the file URI */
/* NOTE(review): &root must be %let-defined by the caller; keeps only
   files whose name contains it (case-insensitive) */
if index(lower(file_name),compress(lower("&root")))>0;
keep file_name;
run;
|
Fastest way to determine the number of unique levels all the variables in a data set have |
---|
/* Fastest way to count distinct levels of every variable: PROC FREQ's
   NLEVELS option captured through ODS (no tables are printed).
   BUGFIX: _char_ covered only character variables; _all_ matches the
   stated goal of counting levels for all the variables. */
ods output nlevels=levels;
proc freq data=L.Visit_20131114_sm nlevels;
tables _all_ /noprint;
run;
|
How to determine first or last element corresponding with another ordered value using PROC SQL |
---|
SELECT
a.USER_ID
,min(b.ENTRY_DATE_TIME) format=DATETIME16. as FIRST_DTTIME
/* Trick: min/max of put(datetime,DATETIME16.)||put(id,best10.) orders the
   concatenated strings by the fixed-width 16-char datetime prefix, so
   substr(...,17) recovers the id paired with the earliest/latest datetime */
,input(substr(min(put(b.ENTRY_DATE_TIME,DATETIME16.)||put(b.client_id,best10.)),17),best12.) as FIRST_CLIENT
,input(substr(max(put(b.ENTRY_DATE_TIME,DATETIME16.)||put(b.client_id,best10.)),17),best12.) as LAST_
FROM ....
GROUP BY 1;
|
Create a Transition Matrix |
---|
/*******************************************/
/* Transition Table Macro */
/*******************************************/
/*--------------------------------------------------------------------------*/
/*
MACRO INPUTS
indata: Name of input dataset
outdata: Name of output dataset
state: The variable that is being transitioned
unit_var: Individual entity identifier (e.g. account number)
date_var: The date variable that will be used to identify the From/To state
from: The 'From' date, e.g. '01Mar2010'd or 'Q1'
to: The 'To' date, e.g. '01Jun2010'd or 'Q1'
NOTES
The macro uses variables named 'Count' and 'Percentage', and therefore variables of the same name cannot be entered as variables on the input data set. These are
left like this for reasons of clarity.
*/
/*--------------------------------------------------------------------------*/
/* Build a from->to state transition matrix (counts and row percentages).
   indata    : input data set
   outdata   : output data set
   state_var : variable being transitioned
   unit_var  : entity identifier (e.g. account number)
   date_var  : date variable identifying the From/To snapshots
   from, to  : the 'From' and 'To' date values
   BUGFIX: the original hard-coded `date` and `account_id` in the SQL below,
   silently ignoring the date_var and unit_var macro parameters. */
%macro transition_table(indata,outdata,state_var,unit_var,date_var,from,to);
/*----------------------------------------------------------*/
/*Create template containing every possible state transition*/
/*----------------------------------------------------------*/
proc sql;
create table &outdata. as
select
a.&state_var as from_state
,b.&state_var as to_state
from
(select distinct &state_var from &indata) a
,(select distinct &state_var from &indata) b;
quit;
/*---------------------------------------------------------------------*/
/*Self join data so from and to state are on the same row and then group
the data so there is only one row for each distinct from/to state. Merge
the result to a template containing every possible state transition.
The grouped total of the 'From' states are also joined to allow for
percentages to be calculated*/
/*---------------------------------------------------------------------*/
proc sql;
create table &outdata. as
select
t.from_state
,t.to_state
,c.count
,c.percentage
from
&outdata. t
left join
(
select
a.&state_var as from_state
,b.&state_var as to_state
,count(*) as count
,count(*)/total as percentage
from
/*Create 'From' temporary table (was hard-coded: date=&from)*/
(select * from &indata where &date_var=&from) a
join
/*Create 'To' temporary table (was hard-coded: date=&to)*/
(select * from &indata where &date_var=&to) b
on a.&unit_var=b.&unit_var
left join
/*Create totals for each 'From' state*/
(select &state_var,count(*) as total from &indata where &date_var=&from group by &state_var) totals
on a.&state_var=totals.&state_var
group by
a.&state_var
,b.&state_var
) c
on t.from_state=c.from_state and t.to_state=c.to_state;
quit;
/*---------------------------------------------------------------------*/
/*Transpose data so 'From' state is down the side, while 'To' state is
along the top. Contains percentage and volume matrix*/
/*---------------------------------------------------------------------*/
proc transpose data=&outdata. out=&outdata.;
by from_state;
id to_state;
var count percentage ;
run;
/*---------------------------------------------------------------------*/
/*Sort data so volume matrix is first and percentage matrix follows*/
/*---------------------------------------------------------------------*/
proc sort data =&outdata.;
by _name_;
run;
%mend;
/************************/
/* Sample Run */
/************************/
/*Create sample date*/
/* Sample data: six accounts observed at two dates, each with a state A-C */
data sample_data;
input @1 date date9. @11 account_id $1. @13 state $1.;
format date date9.;
datalines;
01May2010 1 A
01May2010 2 B
01May2010 3 A
01May2010 4 A
01May2010 5 A
01May2010 6 C
01Jun2010 1 C
01Jun2010 2 B
01Jun2010 3 C
01Jun2010 4 C
01Jun2010 5 B
01Jun2010 6 A
;
run;
/* Transition of `state` per `account_id` from 01May2010 to 01Jun2010 */
%transition_table(sample_data,outdata,state,account_id,date,"01May2010"d,"01Jun2010"d);
|
Cool code to map closest DMA using latitudes and longitudes |
---|
/* Assign the closest DMA (media market) code to a point (lat,lon).
   Each MIN() term is the geodist distance in miles ('M') to one DMA
   reference point, scaled by 1e6 and truncated to an integer, plus
   1000.<dma_code>: the integer part carries the distance so MIN picks
   the nearest market, while the fractional part smuggles the 3-digit
   DMA code along; it is decoded after the MIN (see the last line).
   NOTE(review): a few reference points look mis-geocoded for US DMAs
   (e.g. 53.96,-1.08 is York UK; 51.07,-1.79 is Salisbury UK;
   -37.47,144.79 is in Australia; -12.05,-77.06 is Lima, Peru) --
   verify these against the intended DMA centroid list. */
temp=MIN(
int(geodist(lat,lon,40.7143528,-74.0059731,'M')*1e6)+1000.501,
int(geodist(lat,lon,34.0522342,-118.2436849,'M')*1e6)+1000.803,
int(geodist(lat,lon,41.8781136,-87.6297982,'M')*1e6)+1000.602,
int(geodist(lat,lon,39.952335,-75.163789,'M')*1e6)+1000.504,
int(geodist(lat,lon,32.7801399,-96.8004511,'M')*1e6)+1000.623,
int(geodist(lat,lon,37.7731433,-122.4373674,'M')*1e6)+1000.807,
int(geodist(lat,lon,42.927909,-71.438502,'M')*1e6)+1000.506,
int(geodist(lat,lon,39.4689717,-77.5643496,'M')*1e6)+1000.511,
int(geodist(lat,lon,33.7489954,-84.3879824,'M')*1e6)+1000.524,
int(geodist(lat,lon,29.7601927,-95.3693896,'M')*1e6)+1000.618,
int(geodist(lat,lon,42.331427,-83.0457538,'M')*1e6)+1000.505,
int(geodist(lat,lon,47.44443,-122.300497,'M')*1e6)+1000.819,
int(geodist(lat,lon,34.5033259,-112.2358429,'M')*1e6)+1000.753,
int(geodist(lat,lon,27.3364347,-82.5306527,'M')*1e6)+1000.539,
int(geodist(lat,lon,45.2472601,-93.4553904,'M')*1e6)+1000.613,
int(geodist(lat,lon,25.7889689,-80.2264393,'M')*1e6)+1000.528,
int(geodist(lat,lon,39.737567,-104.9847179,'M')*1e6)+1000.751,
int(geodist(lat,lon,41.0923878,-82.2216083,'M')*1e6)+1000.51,
int(geodist(lat,lon,28.5383355,-81.3792365,'M')*1e6)+1000.534,
int(geodist(lat,lon,37.9175935,-121.1710389,'M')*1e6)+1000.862,
int(geodist(lat,lon,38.6270025,-90.1994042,'M')*1e6)+1000.609,
int(geodist(lat,lon,45.5234515,-122.6762071,'M')*1e6)+1000.82,
int(geodist(lat,lon,40.4406248,-79.9958864,'M')*1e6)+1000.508,
int(geodist(lat,lon,35.772096,-78.6386145,'M')*1e6)+1000.56,
int(geodist(lat,lon,35.2270869,-80.8431267,'M')*1e6)+1000.517,
int(geodist(lat,lon,39.7685825,-86.1579557,'M')*1e6)+1000.527,
int(geodist(lat,lon,39.2903848,-76.6121893,'M')*1e6)+1000.512,
int(geodist(lat,lon,32.7153292,-117.1572551,'M')*1e6)+1000.825,
int(geodist(lat,lon,36.1666667,-86.7833333,'M')*1e6)+1000.659,
int(geodist(lat,lon,32.7264162,-96.9930216,'M')*1e6)+1000.533,
int(geodist(lat,lon,39.0997265,-94.5785667,'M')*1e6)+1000.616,
int(geodist(lat,lon,39.9611755,-82.9987942,'M')*1e6)+1000.535,
int(geodist(lat,lon,40.7607793,-111.8910474,'M')*1e6)+1000.77,
int(geodist(lat,lon,43.0389025,-87.9064736,'M')*1e6)+1000.617,
int(geodist(lat,lon,39.1031182,-84.5120196,'M')*1e6)+1000.515,
int(geodist(lat,lon,29.4241219,-98.4936282,'M')*1e6)+1000.641,
int(geodist(lat,lon,35.0526635,-82.697701,'M')*1e6)+1000.567,
int(geodist(lat,lon,27.5293278,-80.35904,'M')*1e6)+1000.548,
int(geodist(lat,lon,42.9633599,-85.6680863,'M')*1e6)+1000.563,
int(geodist(lat,lon,36.114646,-115.172816,'M')*1e6)+1000.839,
int(geodist(lat,lon,35.4675602,-97.5164276,'M')*1e6)+1000.65,
int(geodist(lat,lon,33.5206608,-86.80249,'M')*1e6)+1000.63,
int(geodist(lat,lon,53.9622908,-1.0818995,'M')*1e6)+1000.566,
int(geodist(lat,lon,36.8507689,-76.2858726,'M')*1e6)+1000.544,
int(geodist(lat,lon,30.267153,-97.7430608,'M')*1e6)+1000.635,
int(geodist(lat,lon,36.0726354,-79.7919754,'M')*1e6)+1000.518,
int(geodist(lat,lon,35.2611263,-106.7234639,'M')*1e6)+1000.79,
int(geodist(lat,lon,38.2526647,-85.7584557,'M')*1e6)+1000.529,
int(geodist(lat,lon,35.1495343,-90.0489801,'M')*1e6)+1000.64,
int(geodist(lat,lon,30.3321838,-81.655651,'M')*1e6)+1000.561,
int(geodist(lat,lon,29.9510658,-90.0715323,'M')*1e6)+1000.622,
int(geodist(lat,lon,42.8864468,-78.8783689,'M')*1e6)+1000.514,
int(geodist(lat,lon,41.667668,-70.904307,'M')*1e6)+1000.521,
int(geodist(lat,lon,41.2459149,-75.8813075,'M')*1e6)+1000.577,
int(geodist(lat,lon,36.9858984,-119.2320784,'M')*1e6)+1000.866,
int(geodist(lat,lon,34.2284312,-92.0031955,'M')*1e6)+1000.693,
int(geodist(lat,lon,37.2279279,-77.4019267,'M')*1e6)+1000.556,
int(geodist(lat,lon,42.7832877,-73.8565616,'M')*1e6)+1000.532,
int(geodist(lat,lon,36.1539816,-95.992775,'M')*1e6)+1000.671,
int(geodist(lat,lon,31.1061175,-87.6151775,'M')*1e6)+1000.686,
int(geodist(lat,lon,35.9606384,-83.9207392,'M')*1e6)+1000.557,
int(geodist(lat,lon,26.225032,-81.605586,'M')*1e6)+1000.571,
int(geodist(lat,lon,39.7589478,-84.1916069,'M')*1e6)+1000.542,
int(geodist(lat,lon,38.0405837,-84.5037164,'M')*1e6)+1000.541,
int(geodist(lat,lon,36.178422,-115.213581,'M')*1e6)+1000.564,
int(geodist(lat,lon,37.6888889,-97.3361111,'M')*1e6)+1000.678,
int(geodist(lat,lon,43.5944677,-83.8888647,'M')*1e6)+1000.513,
int(geodist(lat,lon,37.2902619,-80.0342687,'M')*1e6)+1000.573,
int(geodist(lat,lon,44.266679,-88.384972,'M')*1e6)+1000.658,
int(geodist(lat,lon,32.245345,-110.9426876,'M')*1e6)+1000.789,
int(geodist(lat,lon,21.3069444,-157.8583333,'M')*1e6)+1000.744,
int(geodist(lat,lon,42.0236281,-93.6097448,'M')*1e6)+1000.679,
int(geodist(lat,lon,47.6587802,-117.4260466,'M')*1e6)+1000.881,
int(geodist(lat,lon,37.2089572,-93.2922989,'M')*1e6)+1000.619,
int(geodist(lat,lon,41.2523634,-95.9979883,'M')*1e6)+1000.652,
int(geodist(lat,lon,41.6639383,-83.555212,'M')*1e6)+1000.547,
int(geodist(lat,lon,34.0007104,-81.0348144,'M')*1e6)+1000.546,
int(geodist(lat,lon,43.16103,-77.6109219,'M')*1e6)+1000.538,
int(geodist(lat,lon,34.7303688,-86.5861037,'M')*1e6)+1000.691,
int(geodist(lat,lon,43.7123124,-70.2915077,'M')*1e6)+1000.5,
int(geodist(lat,lon,37.0833893,-88.6000478,'M')*1e6)+1000.632,
int(geodist(lat,lon,32.5251516,-93.7501789,'M')*1e6)+1000.612,
int(geodist(lat,lon,39.8403147,-88.9548001,'M')*1e6)+1000.648,
int(geodist(lat,lon,43.0481221,-76.1474244,'M')*1e6)+1000.555,
int(geodist(lat,lon,43.0730517,-89.4012302,'M')*1e6)+1000.669,
int(geodist(lat,lon,26.1906306,-97.6961026,'M')*1e6)+1000.636,
int(geodist(lat,lon,35.0456297,-85.3096801,'M')*1e6)+1000.575,
int(geodist(lat,lon,31.549333,-97.1466695,'M')*1e6)+1000.625,
int(geodist(lat,lon,38.8631375,-104.8454619,'M')*1e6)+1000.752,
int(geodist(lat,lon,41.9778795,-91.6656232,'M')*1e6)+1000.637,
int(geodist(lat,lon,31.7587198,-106.4869314,'M')*1e6)+1000.765,
int(geodist(lat,lon,32.0835407,-81.0998342,'M')*1e6)+1000.507,
int(geodist(lat,lon,32.2987573,-90.1848103,'M')*1e6)+1000.718,
int(geodist(lat,lon,30.4582829,-91.1403196,'M')*1e6)+1000.716,
int(geodist(lat,lon,41.684011,-85.970697,'M')*1e6)+1000.588,
int(geodist(lat,lon,36.4578697,-82.5341277,'M')*1e6)+1000.531,
int(geodist(lat,lon,44.6994873,-73.4529124,'M')*1e6)+1000.523,
int(geodist(lat,lon,32.7765656,-79.9309216,'M')*1e6)+1000.519,
int(geodist(lat,lon,41.5067003,-90.5151342,'M')*1e6)+1000.682,
int(geodist(lat,lon,34.8526176,-82.3940104,'M')*1e6)+1000.545,
int(geodist(lat,lon,35.3859242,-94.3985475,'M')*1e6)+1000.67,
int(geodist(lat,lon,40.3267407,-78.9219698,'M')*1e6)+1000.574,
int(geodist(lat,lon,33.882649,-79.47283,'M')*1e6)+1000.57,
int(geodist(lat,lon,37.9715592,-87.5710898,'M')*1e6)+1000.649,
int(geodist(lat,lon,41.282784,-99.257628,'M')*1e6)+1000.722,
int(geodist(lat,lon,30.5635855,-84.2152225,'M')*1e6)+1000.53,
int(geodist(lat,lon,32.1070105,-94.8806935,'M')*1e6)+1000.709,
int(geodist(lat,lon,39.5296329,-119.8138027,'M')*1e6)+1000.811,
int(geodist(lat,lon,41.079273,-85.1393513,'M')*1e6)+1000.509,
int(geodist(lat,lon,41.0997803,-80.6495194,'M')*1e6)+1000.536,
int(geodist(lat,lon,43.6140237,-116.20365,'M')*1e6)+1000.757,
int(geodist(lat,lon,43.5335197,-96.7160928,'M')*1e6)+1000.725,
int(geodist(lat,lon,33.4734978,-82.0105148,'M')*1e6)+1000.52,
int(geodist(lat,lon,42.2042586,-72.6162009,'M')*1e6)+1000.543,
int(geodist(lat,lon,42.732535,-84.5555347,'M')*1e6)+1000.551,
int(geodist(lat,lon,40.4904902,-89.0611888,'M')*1e6)+1000.675,
int(geodist(lat,lon,32.6562137,-83.8796733,'M')*1e6)+1000.724,
int(geodist(lat,lon,32.3045474,-86.4133412,'M')*1e6)+1000.698,
int(geodist(lat,lon,44.2467827,-85.3911776,'M')*1e6)+1000.54,
int(geodist(lat,lon,32.8406946,-83.6324022,'M')*1e6)+1000.503,
int(geodist(lat,lon,44.0520691,-123.0867536,'M')*1e6)+1000.801,
int(geodist(lat,lon,46.6020711,-120.5058987,'M')*1e6)+1000.81,
int(geodist(lat,lon,34.844663,-120.4100205,'M')*1e6)+1000.855,
int(geodist(lat,lon,30.2240897,-92.0198427,'M')*1e6)+1000.642,
int(geodist(lat,lon,36.6002378,-121.8946761,'M')*1e6)+1000.828,
int(geodist(lat,lon,35.3732921,-119.0187125,'M')*1e6)+1000.8,
int(geodist(lat,lon,32.6423608,-85.3343965,'M')*1e6)+1000.522,
int(geodist(lat,lon,44.811349,-91.4984941,'M')*1e6)+1000.702,
int(geodist(lat,lon,27.8005828,-97.396381,'M')*1e6)+1000.6,
int(geodist(lat,lon,35.2219971,-101.8312969,'M')*1e6)+1000.634,
int(geodist(lat,lon,39.7572049,-121.8301871,'M')*1e6)+1000.868,
int(geodist(lat,lon,34.2257255,-77.9447102,'M')*1e6)+1000.55,
int(geodist(lat,lon,39.9611755,-82.9987942,'M')*1e6)+1000.673,
int(geodist(lat,lon,45.610836,-89.526737,'M')*1e6)+1000.705,
int(geodist(lat,lon,42.2711311,-89.0939952,'M')*1e6)+1000.61,
int(geodist(lat,lon,39.0558235,-95.6890185,'M')*1e6)+1000.605,
int(geodist(lat,lon,33.2152498,-92.6406703,'M')*1e6)+1000.628,
int(geodist(lat,lon,38.8645565,-77.1877587,'M')*1e6)+1000.604,
int(geodist(lat,lon,46.7207737,-92.1040796,'M')*1e6)+1000.676,
int(geodist(lat,lon,42.1224722,-122.3641035,'M')*1e6)+1000.813,
int(geodist(lat,lon,29.9851829,-94.061336,'M')*1e6)+1000.692,
int(geodist(lat,lon,33.5778631,-101.8551665,'M')*1e6)+1000.651,
int(geodist(lat,lon,34.6035669,-98.3959291,'M')*1e6)+1000.627,
int(geodist(lat,lon,51.068785,-1.794472,'M')*1e6)+1000.576,
int(geodist(lat,lon,61.2180556,-149.9002778,'M')*1e6)+1000.743,
int(geodist(lat,lon,42.1292241,-80.085059,'M')*1e6)+1000.516,
int(geodist(lat,lon,42.4999942,-96.4003069,'M')*1e6)+1000.624,
int(geodist(lat,lon,33.8302961,-116.5452921,'M')*1e6)+1000.804,
int(geodist(lat,lon,37.424084,-94.7003079,'M')*1e6)+1000.603,
int(geodist(lat,lon,31.5785074,-84.155741,'M')*1e6)+1000.525,
int(geodist(lat,lon,48.2325095,-101.2962732,'M')*1e6)+1000.687,
int(geodist(lat,lon,31.8456816,-102.3676431,'M')*1e6)+1000.633,
int(geodist(lat,lon,30.267153,-97.7430608,'M')*1e6)+1000.611,
int(geodist(lat,lon,39.4667034,-87.4139092,'M')*1e6)+1000.581,
int(geodist(lat,lon,44.8011821,-68.7778138,'M')*1e6)+1000.537,
int(geodist(lat,lon,37.7054122,-81.2470144,'M')*1e6)+1000.559,
int(geodist(lat,lon,42.0986867,-75.9179738,'M')*1e6)+1000.502,
int(geodist(lat,lon,40.3174072,-80.6035494,'M')*1e6)+1000.554,
int(geodist(lat,lon,8.9942689,-79.518792,'M')*1e6)+1000.656,
int(geodist(lat,lon,30.41338,-89.072958,'M')*1e6)+1000.746,
int(geodist(lat,lon,43.628367,-116.2016511,'M')*1e6)+1000.657,
int(geodist(lat,lon,44.0682019,-114.7420408,'M')*1e6)+1000.758,
int(geodist(lat,lon,29.6516344,-82.3248262,'M')*1e6)+1000.592,
int(geodist(lat,lon,32.658465,-100.1068496,'M')*1e6)+1000.662,
int(geodist(lat,lon,32.6926512,-114.6276916,'M')*1e6)+1000.771,
int(geodist(lat,lon,46.8605189,-114.019501,'M')*1e6)+1000.762,
int(geodist(lat,lon,31.3231637,-89.2822736,'M')*1e6)+1000.71,
int(geodist(lat,lon,45.7832856,-108.5006904,'M')*1e6)+1000.756,
int(geodist(lat,lon,31.2232313,-85.3904888,'M')*1e6)+1000.606,
int(geodist(lat,lon,39.2806451,-80.3445341,'M')*1e6)+1000.598,
int(geodist(lat,lon,42.2528772,-71.0022705,'M')*1e6)+1000.717,
int(geodist(lat,lon,43.100903,-75.232664,'M')*1e6)+1000.526,
int(geodist(lat,lon,44.0805434,-103.2310149,'M')*1e6)+1000.764,
int(geodist(lat,lon,42.0930035,-76.7995871,'M')*1e6)+1000.565,
int(geodist(lat,lon,30.2265949,-93.2173758,'M')*1e6)+1000.643,
int(geodist(lat,lon,35.6145169,-88.8139469,'M')*1e6)+1000.639,
int(geodist(lat,lon,42.3709299,-71.1828321,'M')*1e6)+1000.549,
int(geodist(lat,lon,38.4495688,-78.8689155,'M')*1e6)+1000.569,
int(geodist(lat,lon,31.3112936,-92.4451371,'M')*1e6)+1000.644,
int(geodist(lat,lon,46.5475825,-87.3955954,'M')*1e6)+1000.553,
int(geodist(lat,lon,35.8422967,-90.704279,'M')*1e6)+1000.734,
int(geodist(lat,lon,36.9903199,-86.4436018,'M')*1e6)+1000.736,
int(geodist(lat,lon,38.0293059,-78.4766781,'M')*1e6)+1000.584,
int(geodist(lat,lon,27.506407,-99.5075421,'M')*1e6)+1000.749,
int(geodist(lat,lon,38.9585381,-108.6175626,'M')*1e6)+1000.773,
int(geodist(lat,lon,43.6121087,-116.3915131,'M')*1e6)+1000.711,
int(geodist(lat,lon,45.6703654,-111.2482862,'M')*1e6)+1000.754,
int(geodist(lat,lon,34.7074745,-82.3017728,'M')*1e6)+1000.647,
int(geodist(lat,lon,40.4167022,-86.8752869,'M')*1e6)+1000.582,
int(geodist(lat,lon,47.5002354,-111.3008083,'M')*1e6)+1000.755,
int(geodist(lat,lon,42.5629668,-114.4608711,'M')*1e6)+1000.76,
int(geodist(lat,lon,44.0581728,-121.3153096,'M')*1e6)+1000.821,
int(geodist(lat,lon,39.2667418,-81.5615135,'M')*1e6)+1000.597,
int(geodist(lat,lon,40.8020712,-124.1636729,'M')*1e6)+1000.802,
int(geodist(lat,lon,41.876312,-103.637984,'M')*1e6)+1000.759,
int(geodist(lat,lon,31.4637723,-100.4370375,'M')*1e6)+1000.661,
int(geodist(lat,lon,42.8335479,-107.7695161,'M')*1e6)+1000.767,
int(geodist(lat,lon,44.1635775,-93.9993996,'M')*1e6)+1000.737,
int(geodist(lat,lon,-12.0478158,-77.0622028,'M')*1e6)+1000.558,
int(geodist(lat,lon,40.1947539,-92.5832496,'M')*1e6)+1000.631,
int(geodist(lat,lon,39.757944,-94.836541,'M')*1e6)+1000.638,
int(geodist(lat,lon,64.8377778,-147.7163889,'M')*1e6)+1000.745,
int(geodist(lat,lon,39.9403453,-82.0131924,'M')*1e6)+1000.596,
int(geodist(lat,lon,-37.4713077,144.7851531,'M')*1e6)+1000.626,
int(geodist(lat,lon,46.681153,-68.0158615,'M')*1e6)+1000.552,
int(geodist(lat,lon,46.5958056,-112.0270306,'M')*1e6)+1000.766,
int(geodist(lat,lon,58.3019444,-134.4197222,'M')*1e6)+1000.747,
int(geodist(lat,lon,44.1813767,-98.365646,'M')*1e6)+1000.583,
int(geodist(lat,lon,41.1238873,-100.7654232,'M')*1e6)+1000.74,
int(geodist(lat,lon,47.1086111,-104.7105556,'M')*1e6)+1000.798
);
/* Decode: the fractional part of the winning term is .<dma_code>,
   so multiplying it by 1000 recovers the 3-digit DMA code */
DMA=int((temp-int(temp))*1000);
|
Rare event regression using Proc Logistic |
---|
Popular statistical procedures, such as logistic regression, sharply underestimate the probability of rare events. Solution: more efficient sampling designs exist for making valid inference: sample all available events and a tiny fraction of nonevents. This kind of sampling is also called oversampling, retrospective sampling, biased sampling, or choice-based sampling. In the biological sciences, studies using this kind of sampling are known as case-control studies. Parameter and odds ratio estimates of the covariates (and their confidence limits) are unaffected by this type of stratified sampling. However, the intercept estimate is affected by the sampling, so any computation based on the full set of parameter estimates is incorrect, such as predicted event probabilities, or differences or ratios of event probabilities. If you know the probabilities of events and nonevents in the population, then you can adjust the intercept either by weighting or by using an offset. Adjusting the intercept: to adjust by weighting, add a variable to your data set that takes the value p1/r1 in event observations and the value (1-p1)/(1-r1) in nonevent observations, where p1 is the probability of an event in the population and r1 is the proportion of events in your data set. Specify this variable in the WEIGHT statement in PROC LOGISTIC. Or, to adjust by using an offset, add a variable to your data set defined as log[(r1*(1-p1)) / ((1-r1)*p1)], where log is the natural logarithm. Specify this variable in the OFFSET= option of the MODEL statement in PROC LOGISTIC. Below is an example. data full;
/* (continues the `data full;` step opened above)
   Simulate a population with a rare event: true intercept -3.35, slope 2 */
do i=1 to 1000;
x=rannor(12342);
p=1/(1+exp(-(-3.35+2*x)));
y=ranbin(98435,1,p);
drop i;
output;
end;
run;
/* Oversample: keep all events, ~1/9 of the nonevents */
data sub;
set full;
if y=1 or (y=0 and ranuni(75302)<1/9) then output;
run;
proc freq data=full;
table y / out=fullpct(where=(y=1) rename=(percent=fullpct));
title "response counts in full data set";
run;
proc freq data=sub;
table y / out=subpct(where=(y=1) rename=(percent=subpct));
title "Response counts in oversampled, subset data set";
run;
/* Attach population (p1) and sample (r1) event rates, then build the
   correction weight w and the offset off described above */
data sub;
set sub;
if _n_=1 then set fullpct(keep=fullpct);
if _n_=1 then set subpct(keep=subpct);
p1=fullpct/100; r1=subpct/100;
w=p1/r1; if y=0 then w=(1-p1)/(1-r1);
off=log( (r1*(1-p1)) / ((1-r1)*p1) );
run;
ods select parameterestimates(persist);
/* 1) Unadjusted model: intercept will be biased by the oversampling */
proc logistic data=sub;
model y(event="1")=x;
output out=out p=pnowt;
title "True Parameters: -3.35 (intercept), 2 (X)";
title2 "Unadjusted Model";
run;
/* 2) Weight-adjusted model */
proc logistic data=out;
model y(event="1")=x; weight w;
output out=out p=pwt;
title2 "Weight-adjusted Model";
run;
/* 3) Offset-adjusted model */
proc logistic data=out;
model y(event="1")=x / offset=off;
output out=out xbeta=xboff;
title2 "Offset-adjusted Model";
run;
/* Corrected probability: remove the offset from the linear predictor */
data out;
set out;
poff=logistic(xboff-off);
run;
/* 4) Prior-adjusted scoring: _prior_ carries the population y frequencies */
proc freq data=full noprint;
table y / out=priors(drop=percent rename=(count=_prior_));
run;
proc logistic data=out;
model y(event="1")=x;
score data=sub prior=priors out=out2;
title2 "Unadjusted Model; Prior-adjusted probabilities";
run;
|
Using PROC SQL on a many-to-one merge to calculate counts across date ranges |
---|
/* Many-to-one join of event dates to device reg/dereg records, then count
   how many devices were registered as of each event date `dait` */
proc sql noprint;
create table TABLE_WITH_EVENT_DATES_AND_DEVICE_CNTS as
select customer_id,dait,sum(cnt) as ncnt from
(
select a.customer_id,a.dait,
/* device counts toward dait if registered before it and either still
   registered (TIME_DEREGISTERED missing) or deregistered after dait */
case when (TIME_DEREGISTERED^=. and FIRST_TIME_REGISTERED<=dait<TIME_DEREGISTERED) or
(TIME_DEREGISTERED=. and FIRST_TIME_REGISTERED<dait) then 1 else 0 end as cnt
from TABLE_WITH_EVENT_DATES a left join TABLE_WITH_DEVICE_REGS_AND_DEREGS b
on a.customer_id=b.customer_id
)
group by customer_id,dait;
quit;
|
Winsorized and Trimmed Means using PROC UNIVARIATE source |
---|
/* Trimmed (1 obs and 10%) and winsorized (10%) means with robust scale
   estimates; ODS SELECT limits output to just those tables */
ods select TrimmedMeans WinsorizedMeans RobustScale;
proc univariate data=BPressure trimmed=1 .1 winsorized=.1 robustscale;
var Systolic;
run;
|
Propensity Score Modeling Macro source |
---|
/* Propensity Score Modeling: fit a stepwise logistic model for &event,
   bin the event group's predicted probabilities into &ngroups ranks,
   assign every scored record to a bin via the rank cut points, then
   summarize &yvar by bin and event status.
   NOTE(review): the estimates= parameter is accepted but never used;
   the model estimates land in work.est via outest=. */
%macro PSM_method1(indata=,outdata=,estimates=,results=,event=,yvar=,xvars=,ngroups=,alpha=);
proc logistic data=&indata descending noprint outest=est;
model &event=&xvars/selection=stepwise;
output out=scored p=phat_event;
quit;
/* Rank the event observations' scores into &ngroups groups */
proc rank data=scored out=temp(keep=phat_event r_PHAT_event) group=&ngroups;
var phat_event;
ranks r_phat_event;
where &event=1;
run;
/* Min/max score per rank group -> bin cut points */
proc means nway noprint data=temp;
class r_phat_event;
var phat_event;
output out=ranges
min=min_phat_event
max=max_phat_event;
run;
/* Cut points into macro variables r_1..r_count / mx_1..mx_count */
data _NULL_;
set ranges end=eof;
call symput(compress("r_"||put(_N_,best10.)),r_phat_event);
call symput(compress("mx_"||put(_N_,best10.)),max_phat_event);
if eof then call symput("count",_N_);
run;
/* Assign every scored record to a bin using the cut points */
data &outdata;
set scored;
if .<phat_event<=&mx_1 then phat_event_SCORE=&r_1;
%do j=2 %to %eval(&count-1);
else if phat_event<=&&mx_&j then phat_event_SCORE=&&r_&j;
%end;
%let k=%eval(&j-1);
else if phat_event>&&mx_&k then phat_event_SCORE=&&r_&j;
run;
/* BUGFIX: "alpha=α" was a scrape-mangled HTML entity for alpha=&alpha; */
proc means noprint nway data=&outdata alpha=&alpha;
class phat_event_SCORE &event;
var &yvar;
output out=&results(drop=_TYPE_ _FREQ_)
N=n_&yvar
mean=mn_&yvar
std=sd_&yvar;
run;
%mend PSM_method1;
%PSM_method1(indata=,outdata=,estimates=,results=,event=,yvar=,xvars=,ngroups=5,alpha=.05);
|
Secure data analysis of vertically partitioned data using PROC IML source |
---|
*NOTE: both parties data must be in the same order and only be of the common ids;
*Create party1 data;
/* Simulated party 1: 20 records with a binary x and a random sort key */
data party1;
do i=1 to 20;
id1=i;
x=(ranuni(0)>0.5);
rnd=ranuni(0);
output;
end;
drop i;
run;
proc sort data=party1; by rnd;
/* Obs 11-20 (after the shuffle) get shared positive exp_ids 1..10;
   the rest get unique negative ids so they cannot match */
data comb1; set party1; retain _t; if 10<_N_<=20 then do; _t=sum(_t,1); exp_id=_t; end; else exp_id=-1*_N_; drop _t rnd; run;
*Create party2 data;
/* Simulated party 2: 40 records with a continuous y */
data party2;
do i=1 to 40;
id2=i;
y=ranuni(0)*10000;
rnd=ranuni(0);
output;
end;
drop i;
run;
proc sort data=party2; by rnd;
data comb2; set party2; retain _t; if 10<_N_<=20 then do; _t=sum(_t,1); exp_id=_t; end; else exp_id=-1*_N_; drop _t rnd; run;
*Join together party1 and party2 data so we can analyze it;
proc sort data=comb1; by exp_id;
proc sort data=comb2; by exp_id;
/* Keep only records both parties share (positive exp_id) */
data both;
merge comb1(in=a) comb2(in=b);
by exp_id;
if a=1 and b=1 and exp_id>0;
drop id1 id2;
run;
option nocenter;
PROC IML;
*Read in party1 and party2 data into separate matrices;
/* NOTE: the x column plays the role of party2's covariates, the y column
   party1's response */
use both;
read all var{x} into party2;
read all var{y} into party1;
close both;
n=nrow(party2); p=ncol(party2);
g=ceil((n-p)/2); /* round up */
print n p g;
/* QR decomposition of party2's design matrix: columns p+1..n of Q span
   the orthogonal complement of the column space */
call QR(q,r,piv,lindep,party2);
ng=n-p;
*pick g cols of q from col p+1 to n;
/* Randomly choose g of those columns by sorting on a uniform draw */
_t=t((p+1):n)||uniform(repeat(0,ng,1));
print _t;
call sort(_t,2);
_t2=t(_t[1:g,1]);
print _t2;
*calculate z and send to party1;
z=q[,_t2];
*check=t(z)*party2; *must equal 0;
*check=t(z)*z; *must equal I;
*print check;
*calculate w and send back to party2;
/* Project party1's response off the random subspace: masks raw values */
w=(I(n)-z*t(z))*party1;
print w;
*note you cannot figure out party1 data from w;
*y calc below will fail;
*y=inv(1-z*t(z))*w;
*print y party1;
*the secure answer calculated by party2;
/* OLS on the masked response equals OLS on the true response because z is
   orthogonal to party2's columns */
ANS=inv(t(party2)*party2)*t(party2)*w;
print "the secure answer:" ANS;
*the correct answer;
check=inv(t(party2)*party2)*t(party2)*party1;
print "the correct answer:" check;
quit;
|
How to send SAS output in e-mail |
---|
/* Route ODS HTML output straight to an e-mail message via the EMAIL
   access method */
filename outbox email
to='susan@mvs'
type='text/html'
subject='Temperature conversions'
;
data temperatures;
do centigrade = -40 to 100 by 10;
fahrenheit = centigrade*9/5+32;
output;
end;
run;
/* Writing the HTML body to the email fileref sends the message when the
   destination is closed */
ods html
body=outbox /* Mail it! */
rs=none;
title 'Centigrade to Fahrenheit conversion table';
proc print;
id centigrade;
var fahrenheit;
run;
ods html close;
|
Variable Selection for PROC LOGISTIC and large data |
---|
/*
PROC LOGISTIC uses an iterative algorithm to produce its regression results.
This can be a problem when dealing with large data, lots of x-variables, and
a variable selection problem.
This macro does efficient variable selection for PROC LOGISTIC using PROC TTEST:
candidate x-variables are screened by a two-sample t-test against &yvar, and
only the strongest (|t| > &tcut, p < 0.0001, equal variances) are modeled.
Stages are switchable: runttest= (Y/N) screen, runreg= (Y/N) fit, runscore=
(Y/N) score &inscore with the stored model in &outmodel.
*/
%macro logreg(indata=,idvar=,tcut=,wtvar=,yvar=,xvars=,runttest=Y,runreg=N,outttest=,outmodel=,runscore=,inscore=,outscore=);
%if &runttest=Y %then %do;
ods listing close;
ods output TTests=&outttest;
proc ttest data=&indata;
class &yvar;
var &xvars;
run;
%end; /* ttest */
ods listing;
/* Keep variables with a strong univariate relationship to &yvar */
data ods_ttests2;
set &outttest;
tValue2=abs(tValue);
if upcase(trim(Variable)) ne upcase(trim("&yvar")) and Variances="Equal" and Probt<0.0001 and tValue2>&tcut;
run;
proc sort data=ods_ttests2; by descending tValue2;
/* Surviving variables into macro vars xvar1..xvarN */
data _null_;
set ods_ttests2 end=eof;
call symput(compress("xvar"||put(_N_,best10.)),Variable);
call symput(compress("tval"||put(_N_,best10.)),tValue2);
call symput(compress("pval"||put(_N_,best10.)),probt);
if eof then call symput("nxvar",_N_);
run;
%do i=1 %to &nxvar;
%put x-var&i = &&xvar&i, abs_tval&i = &&tval&i, pval&i = &&pval&i;
%end;
%put number of x-vars = &nxvar;
%if &runreg=Y %then %do;
/* Fit on the screened variables only; store the model for later scoring */
data small(keep=&idvar &yvar &wtvar %do i=1 %to &nxvar; &&xvar&i %end;); set &indata;
ods output RSquare=ods_rsq;
proc logistic descending outmodel=&outmodel data=small;
model &yvar=%do i=1 %to &nxvar; &&xvar&i %end; /rsquare;
weight &wtvar;
/* wt=0 records are the holdout: scored but not used in estimation */
output out=scored(keep=&idvar &wtvar &yvar phat where=(&wtvar=0)) p=phat;
quit;
/* Decile lift table on the holdout */
proc rank data=scored out=scored group=10;
var phat;
ranks r_phat;
run;
proc means mean data=scored;
class r_phat;
var &yvar;
run;
%end; /* logistic */
%if &runscore=Y %then %do;
proc logistic inmodel=&outmodel;
/* BUGFIX: keep= hard-coded customer_id; use the idvar parameter */
score data=&inscore out=&outscore(keep=&idvar P_1);
run;
%end; /* score */
%mend logreg;
%logreg(runttest=N,
runreg=N,
runscore=Y,
indata=L.REG_model_sm,
idvar=customer_id,
wtvar=wt,
tcut=3,
yvar=V_Y_FIRE,
xvars=V_:,
outttest=temp_ttest,
outmodel=REG_est,
inscore=L.REG_model_sm,
outscore=reg_score);
|
Bootstrap with bias correction source |
---|
/* Example of Bootstrap with bias correction
http://www.ats.ucla.edu/stat/sas/faq/bootstrap.htm */
/* Determine random samples */
proc surveyselect data=L.fashion6_wk out=sample method=urs sampsize=200 reps=1000 seed=377183 noprint;
samplingunit CUSTOMER_ID;
run;
/* Run on all data for bias correction later */
ods listing close;
ods output ParameterEstimates=pe0(where=(index(Parameter,"adx")>0) keep=estimate parameter);
proc glm data=L.fashion6_wk;
model ln_OPS_TOTAL=
tenure
dum_C_COSMO
dum_C_ELLE
dum_C_HARP
dum_C_ALLURE
dum_T_GLAM
dum_T_INSTY
dum_T_LUCKY
dum_T_MCLAIR
dum_T_VOGUE
dum_T_WWD
AD_PERIOD_GLAM
AD_PERIOD_INSTY
AD_PERIOD_LUCKY
AD_PERIOD_MCLAIR
AD_PERIOD_VOGUE
AD_PERIOD_WWD
ADX_GLAM
ADX_INSTY
ADX_LUCKY
ADX_MCLAIR
ADX_VOGUE
ADX_WWD
tenure/solution;
;
quit;
ods listing;
proc transpose data=pe0 out=t_pe0(drop=_NAME_) prefix=_;
id parameter;
var estimate;
run;
/* Run for each replicate */
ods listing close;
ods output ParameterEstimates=pe(where=(index(Parameter,"adx")>0) keep=replicate estimate parameter);
proc glm data=sample;
by replicate;
model ln_OPS_TOTAL=
tenure
dum_C_COSMO
dum_C_ELLE
dum_C_HARP
dum_C_ALLURE
dum_T_GLAM
dum_T_INSTY
dum_T_LUCKY
dum_T_MCLAIR
dum_T_VOGUE
dum_T_WWD
AD_PERIOD_GLAM
AD_PERIOD_INSTY
AD_PERIOD_LUCKY
AD_PERIOD_MCLAIR
AD_PERIOD_VOGUE
AD_PERIOD_WWD
ADX_GLAM
ADX_INSTY
ADX_LUCKY
ADX_MCLAIR
ADX_VOGUE
ADX_WWD
tenure/solution;
;
quit;
ods listing;
proc transpose data=pe out=t_pe(drop=_NAME_);
by replicate;
id parameter;
var estimate;
run;
/* Begin bias correction calculation:
   attach the full-sample estimates (one row, read once at _n_=1 and
   retained) to every replicate row, then flag, per coefficient, whether
   the bootstrap estimate is <= the full-sample estimate. */
data t_pe;
if _n_=1 then set t_pe0;
set t_pe;
/* old-style implicit arrays (no dimension) are required by DO OVER:
   x = bootstrap estimates, y = full-sample estimates (prefix _),
   z = 0/1 indicators (prefix b_) */
array x ADX_GLAM ADX_INSTY ADX_LUCKY ADX_MCLAIR ADX_VOGUE ADX_WWD;
array y _ADX_GLAM _ADX_INSTY _ADX_LUCKY _ADX_MCLAIR _ADX_VOGUE _ADX_WWD;
array z b_ADX_GLAM b_ADX_INSTY b_ADX_LUCKY b_ADX_MCLAIR b_ADX_VOGUE b_ADX_WWD;
do over z; z=(x<=y); end;
run;
/* mean of each b_* = proportion of bootstrap estimates at or below the
   full-sample estimate, the input to the bias-correction constant z0 */
proc means data=t_pe noprint;
var b_:; output out=bias mean=;
run;
/* alpha level for the confidence interval (here 80% CI) */
%let alphalev = .2;
%let alpha1 = %sysevalf(1 - &alphalev/2);
%put &alpha1;
/* Turn each proportion into the bias-correction constant z0 = probit(p),
   then compute the bias-corrected percentile endpoints
   lower = 100*Phi(2*z0 - z_alpha), upper = 100*Phi(2*z0 + z_alpha)
   and store them in macro vars named p1_<coef> / p2_<coef>
   (via VNAME of the a/b array elements). */
data bias2;
set bias;
array x b_ADX_GLAM b_ADX_INSTY b_ADX_LUCKY b_ADX_MCLAIR b_ADX_VOGUE b_ADX_WWD;
array a p1_ADX_GLAM p1_ADX_INSTY p1_ADX_LUCKY p1_ADX_MCLAIR p1_ADX_VOGUE p1_ADX_WWD;
array b p2_ADX_GLAM p2_ADX_INSTY p2_ADX_LUCKY p2_ADX_MCLAIR p2_ADX_VOGUE p2_ADX_WWD;
do over x;
z0 = probit(x);
zalpha = probit(&alpha1);
/* 3.0 format: percentile as a whole number, as PCTLPTS= expects */
a = put(probnorm(2*z0 - zalpha)*100, 3.0);
b = put(probnorm(2*z0 + zalpha)*100, 3.0);
call symput(compress(vname(a)),trim(a));
call symput(compress(vname(b)),trim(b));
end;
run;
/* Determine confidence intervals: for each coefficient, take the
   bias-corrected percentiles (macro vars p1_*/p2_* set above) of its
   bootstrap distribution, plus the bootstrap mean, and label the row. */
proc univariate data=t_pe noprint; var ADX_GLAM;
output out=f1 pctlpts=&p1_ADX_GLAM &p2_ADX_GLAM pctlpre=p pctlname = _lb _ub mean=mn; data f1; length _label_ $12.; set f1; _label_="ADX_GLAM";
proc univariate data=t_pe noprint; var ADX_INSTY;
output out=f2 pctlpts=&p1_ADX_INSTY &p2_ADX_INSTY pctlpre=p pctlname = _lb _ub mean=mn; data f2; length _label_ $12.; set f2; _label_="ADX_INSTY";
proc univariate data=t_pe noprint; var ADX_LUCKY;
output out=f3 pctlpts=&p1_ADX_LUCKY &p2_ADX_LUCKY pctlpre=p pctlname = _lb _ub mean=mn; data f3; length _label_ $12.; set f3; _label_="ADX_LUCKY";
proc univariate data=t_pe noprint; var ADX_MCLAIR;
output out=f4 pctlpts=&p1_ADX_MCLAIR &p2_ADX_MCLAIR pctlpre=p pctlname = _lb _ub mean=mn; data f4; length _label_ $12.; set f4; _label_="ADX_MCLAIR";
proc univariate data=t_pe noprint; var ADX_VOGUE;
output out=f5 pctlpts=&p1_ADX_VOGUE &p2_ADX_VOGUE pctlpre=p pctlname = _lb _ub mean=mn; data f5; length _label_ $12.; set f5; _label_="ADX_VOGUE";
proc univariate data=t_pe noprint; var ADX_WWD;
output out=f6 pctlpts=&p1_ADX_WWD &p2_ADX_WWD pctlpre=p pctlname = _lb _ub mean=mn; data f6; length _label_ $12.; set f6; _label_="ADX_WWD";
run;
/* Stack results: one row per coefficient with p_lb, p_ub, mn, _label_ */
data final;
set f1 f2 f3 f4 f5 f6;
attrib _all_ label="";
run;
|
PROC RANK workaround macro for large data |
---|
/* Author: Rickmc PROC RANK uses RAM and, for large data, can run out of memory. Hence this macro. */
/* QASG: assign quantile-group numbers (1..&_groups) to &_var within
   optional BY groups, without PROC RANK (which can exhaust RAM on large
   data).  Ranks are computed sequentially on sorted data, averaging the
   rank over ties, then mapped to groups with
   floor(mean_rank * groups / (n+1)) + 1. */
%MACRO QASG(_indata=,_byvar=,_var=,_qvar=,_outdata=,_groups=);
%if &_byvar^= %then %do;
proc sort data=&_indata; by &_byvar &_var;
/* _temp1: one row per (&_byvar, &_var) value with its mean rank
   _temp2: one row per &_byvar group with the group's row count (dsobs) */
data _temp1(keep=&_byvar &_var mean_rnk)
_temp2(keep=&_byvar rc rename=(rc=dsobs));
set &_indata;
by &_byvar &_var;
/* xc = rows sharing the current &_var value (tie count)
   rc = running row count within the BY group (i.e. the rank)
   rnk = sum of ranks over the current tie group */
retain xc rc rnk;
if first.&_byvar then do; xc=0; rc=0; rnk=0; end;
if first.&_var then do; xc=0; rnk=0; end;
xc=xc+1;
rc=rc+1;
rnk=rnk+rc;
/* mean rank of the tied values, as PROC RANK's TIES=MEAN would give */
mean_rnk=rnk/xc;
if last.&_byvar then output _temp2;
if last.&_var then output _temp1;
run;
proc sort data=_temp1; by &_byvar;
proc sort data=_temp2; by &_byvar;
/* map mean rank to a quantile group using the BY group's size */
data _temp3(keep=&_byvar &_var &_qvar);
merge _temp1 _temp2;
by &_byvar;
&_qvar=floor(mean_rnk * &_groups / (dsobs + 1))+1;
run;
proc sort data=_temp3; by &_byvar &_var;
/* attach the group number back onto the original rows */
data &_outdata;
merge &_indata _temp3;
by &_byvar &_var;
run;
%end;
%if &_byvar= %then %do;
/* no BY variable: do the same in SQL; MONOTONIC() on the sorted data
   is the row number, so mean(MONOTONIC()) per value is the mean rank.
   NOTE(review): MONOTONIC() is undocumented -- verify on your SAS release. */
proc sort data=&_indata; by &_var;
proc sql noprint;
select count(*) into :dsobs from &_indata;
create table _temp1 as
select &_var,floor(mean(MONOTONIC())*&_groups/(&dsobs+1))+1 as &_qvar
from &_indata
group by &_var
order by &_var;
quit;
data &_outdata;
merge &_indata _temp1;
by &_var;
run;
%end;
/* clean up the work datasets this macro created */
proc datasets nofs nolist library=work memtype=(data); delete _temp:; quit;
%MEND QASG;
%QASG(_indata=L.TEMP,_byvar=,_var=_VAR1,_qvar=_Q1,_outdata=TEMP1);
|
Simple Bootstrap Approach source |
---|
/* Using Survey Select which uses RAM. Fastest. */
sasfile YourData load;
/* URS with SAMPRATE=1 and OUTHITS = full-size with-replacement resamples;
   the SURVEYSELECT option is REPS= (the original's REP= is not valid) */
proc surveyselect data=YourData out=outboot seed=30459584 method=urs samprate=1 outhits reps=1000;
run;
sasfile YourData close;
*thing you want to bootstrap;
ods listing close;
/* compute the statistic of interest once per replicate */
proc univariate data=outboot;
var x;
by Replicate;
output out=outall kurtosis=curt;
run;
ods listing;
/* percentile CI from the bootstrap distribution of the statistic */
proc univariate data=outall;
var curt;
output out=final pctlpts=2.5, 97.5 pctlpre=ci;
run;
/* Without Survey Select which requires lots of RAM. Req's lots of disk space. */
sasfile YourData load;
/* Draw 1000 full-size with-replacement resamples by random-access reads:
   NOBS= is set at compile time, so numrecs is usable before SET executes;
   POINT= reads the randomly chosen row directly. */
data outboot(drop=__i);
do Replicate = 1 to 1000;
do __i = 1 to numrecs;
p = int(1 + numrecs*(ranuni(39573293)));
set YourData point=p nobs=numrecs;
output;
end;
end;
run;
*thing you want to bootstrap;
ods listing close;
/* statistic of interest, once per replicate */
proc univariate data=outboot;
var x;
by Replicate;
output out=outall kurtosis=curt;
run;
ods listing;
/* percentile CI from the bootstrap distribution */
proc univariate data=outall;
var curt;
output out=final pctlpts=2.5, 97.5 pctlpre=ci;
run;
/* Without Survey Select and uses least amount of disk space. Slowest. */
/* Same resampling logic as above, but defined as a data-step VIEW, so the
   replicate rows are generated on the fly as each proc reads them rather
   than being written to disk. */
data outboot(drop=__i) / view = outboot; /* 1 */
do Replicate = 1 to 1000;
do __i = 1 to numrecs;
p = int(1 + numrecs*(ranuni(39573293)));
set YourData point=p nobs=numrecs;
output;
end;
end;
run;
*thing you want to bootstrap;
ods listing close;
/* statistic of interest, once per replicate */
proc univariate data=outboot;
var x;
by Replicate;
output out=outall kurtosis=curt;
run;
ods listing;
/* percentile CI from the bootstrap distribution */
proc univariate data=outall;
var curt;
output out=final pctlpts=2.5, 97.5 pctlpre=ci;
run;
|
Use PROC POWER to calculate sample sizes and powers source |
---|
/* Gallery of PROC POWER calls: in each statement, set the unknown
   quantity (power, NTOTAL, NPERGROUP, ...) to '.' and PROC POWER
   solves for it. */
/* two-sample proportions */
proc power;
twosamplefreq test=PCHI
refproportion = 0.0005
proportiondiff = 0.0001
power = .9
alpha= 0.05
npergroup = .
sides = 1;
run;
/* one sample t-test */
* power = ? ;
proc power;
onesamplemeans
alpha=0.05
sides=2
nullm=20
mean=22
stddev=4
ntotal=44
power=.;
run;
* sample size = ? ;
proc power;
onesamplemeans
alpha=0.01
sides=u /* U: upper one-sided, L: lower one-sided */
nullm=20
mean=22
stddev=4
ntotal=.
power=0.8;
run;
/* paired t-test */
proc power;
pairedmeans test=diff
alpha=.01
sides=2
meandiff=3
stddev=3.5
corr=.2
npairs=20 30 40
power=.;
run;
/* independent t-test */
proc power;
twosamplemeans
meandiff=3 to 4 by .5 /* the same as 3 3.5 4 */
stddev=8 to 9 by .5
groupweights=(1 1)
power=0.8
ntotal=.;
plot y=power min=0.5 max=0.99;
run;
* a different way;
proc power;
twosamplemeans
groupmeans=(13 14) (13 14.5) (13 15) /* same as 13|14 14.5 15 */
stddev=1.2 1.7
groupweights=1|1 2 3 /* same as (1 1) (1 2) (1 3) */
power=0.8
ntotal=.;
run;
* Power vs Effect Size;
proc power;
twosamplemeans test=diff
meandiff=0 to 2.5 by 0.5
stddev=.5657 1.0 1.4318
power=.
npergroup=10;
plot x=effect interpol=join;
run;
/* Multiple Regression */
proc power;
multreg
model=random
nfullpredictors=7
ntestpredictors=1
partialcorr=0.35
ntotal=100
power=.;
plot x=n min=50 max=150;
run;
/* One-way ANOVA */
proc power;
onewayanova test=overall
alpha=.05
groupmeans=(5 7 3 11)
stddev=4 5 6
npergroup=10 15
power=.;
run;
/* One sample proportion */
proc power;
onesamplefreq test=z method=normal /* test=adjz with continuity correction */
sides=1 /* one-sided */
alpha=.05
nullproportion=0.3
proportion=.2
ntotal=.
power=.8;
run;
/* Fisher's exact test */
proc power;
twosamplefreq test=fisher
proportiondiff=0.10 to 0.15 by 0.01
refproportion=.2
npergroup=150
power=.;
run;
/* LR Chi-square Test for Two Proportions */
/* test=pchi for Pearson Chi-square Test for Two Proportions */
proc power;
twosamplefreq test=lrchi
proportiondiff=0.10 to 0.15 by 0.01
refproportion=.2
npergroup=150
power=.;
run;
/* Correlation */
proc power;
onecorr dist=fisherz
npvars=6
corr=.35
nullcorr=.2
sides=1
ntotal=100
power=.;
run;
/* comparing 2 survival curves */
proc power;
twosamplesurvival test=logrank
gexphs=0.3567 | 0.5978 .6931
grouplossexphazards=(0.3567 0.3567)
accrualtime=1
followuptime=1
groupweights=(1 2)
power=.
ntotal=225;
run;
/* TOST */
proc power;
twosamplemeans test=equiv_ratio
lower=.8
upper=1.25
meanratio=1 1.2
cv=.1 .2 .3
npergroup=.
power= .8 .9;
run;
|
Trick to flip order of CLASS groups in a PROC |
---|
/* CLASS levels are ordered by their FORMATTED values by default, and GLM
   uses the last level as the reference; prefixing '1:'/'2:' makes the
   1 (Test) level sort first, flipping the reference group. */
proc format;
value flipref
1 = '1:Test'
0 = '2:Control';
run;
PROC GLM DATA=small;
format TC ad_period flipref.;
CLASS TC ad_period;
MODEL OPS_TOTAL =TC|ad_period / solution clparm alpha=0.2;
quit;
|
Difference in Difference (DID) Analysis using SAS |
---|
There are two possible ways to proceed assuming you are working with a
continuous (normal distribution) response.
Structure the file with a value of diff = y2 - y1, where y1 and y2 are
collected at time 1 and 2 respectively:
Dataset one:
id y1 y2 diff group
1 9 8 1 1
2 8 10 -2 1
..
11 8 6 2 2
12 6 9 -3 2
Then run:
ODS SELECT tests3 solutionF;
proc mixed data=one;
class group id;
model diff = group / solution;
LSMEANS group / diff;
run;
The difference of a difference is the coefficient for the 'group' or the
output from LSMEANS diff option.
Or you can structure it as dataset two:
Id y tm group
1 9 1 1
1 8 2 1
2 8 1 1
2 10 2 1
..
11 8 1 2
11 6 2 2
12 6 1 2
12 9 2 2
You need to add the REPEATED statement to account for the correlation
between y1 and y2:
ODS SELECT tests3 solutionF estimates rcorr;
PROC MIXED DATA= two;
class group tm id;
MODEL y = group|tm / solution;
ESTIMATE 'group 1, time 1 - 2' tm 1 -1 group*tm 1 -1 0 0;
ESTIMATE 'group 2, time 1 - 2' tm 1 -1 group*tm 0 0 1 -1;
ESTIMATE 'Diff of diff' group*tm 1 -1 -1 1;
REPEATED tm / subject=id type=un rcorr;
run;
The diff of a diff is either the value of the final ESTIMATE statement,
or the non-zero coefficient for the group*tm interaction term from the
solutionF table.
You can also add in covariates to each model; the value of the pretest,
y1 is often of interest with dataset one, as well as covariates that
remain constant at time 1 and time 2. If you have 'time-dependent'
covariates, then option 2 is needed. It is important to note that, as the
reference above shows, the two formulations are equivalent.
|
Code to assign a unique number to each row of a dataset without sorting. As long as you have less than 2^31-1 rows |
---|
/* Assign a unique pseudo-random id to every row without sorting.
   CALL RANUNI replaces its seed argument with the generator's updated
   internal seed; starting from seed 0 (clock-seeded) and calling it once
   per row walks the full-period (2^31-2) seed sequence, so fake_id is
   distinct on every row as long as the data has fewer than 2^31-1 rows. */
data new;
* fake_id is the new id;
if _n_=1 then do;
fake_id=0;
call ranuni(fake_id,dummy);
put "original seed = " fake_id;
retain fake_id;
end;
set old;
call ranuni(fake_id,dummy);
drop dummy;
run;
|
Use proc format to merge a small-ish data set to a large data set that is too a big to sort |
---|
/* Build a CNTLIN dataset: START = lookup key, LABEL = value to attach,
   FMTNAME = format name ($-prefixed for a character key).
   NOTE(review): assumes customer_id is character here -- confirm; a
   numeric key needs a non-$ format name. */
data fmt1(keep = start label fmtname);
set smaller_segment_dataset;
length label $32.;
start = customer_id; * join variable *;
label = segment; * variable to add *;
fmtname = '$fmt_char';
run;
/* second lookup: numeric key address_id -> dma */
data fmt2(keep = start label fmtname);
set smaller_dma_dataset;
start = address_id; * join variable *;
label = dma; * variable to add *;
fmtname = "fmt_num";
run;
/* load both lookup tables as formats */
proc format cntlin=fmt1;
proc format cntlin=fmt2;
run;
/* "merge" by formatting the key: PUT does a hash-like lookup per row, so
   the big dataset never has to be sorted.  An unmatched key falls through
   to the key's own formatted value (no OTHER= range was defined). */
data big_small_datasets_merged;
set big_dataset;
* to join a numeric variable *;
dma= put(address_id, fmt_num.) *1;
* to join a character variable *;
length segment $32.; segment = put(customer_id, $fmt_char.);
run;
|
Build a date range that increments by day (also by month) |
---|
/* Build macro vars date1..dateN, one per day from &start to &stop
   inclusive (both given as mm/dd/yyyy); ndateI holds the SAS date value,
   dateI the formatted string. */
%macro make_data(start=,stop=);
%let StartDate = %Sysfunc( InputN( &start , mmddyy10 ));
%let StopDate = %Sysfunc( InputN( &stop , mmddyy10 ));
%let i=1;
%let ndate&i=&StartDate; %let date&i=%Sysfunc( PutN( &&ndate&i, mmddyy10 ) );
%do %while(&&ndate&i<=&StopDate);
%let datep1=%eval(&&ndate&i+1);
%let i=%eval(&i+1);
%let ndate&i = &datep1;
%* use mmddyy10 so every dateI matches date1 (original used yymmdd10 here);
%let date&i= %Sysfunc( PutN( &datep1,mmddyy10 ) );
%end;
%* the loop exits one day past &StopDate, so the last kept date is &stop;
%let ndays=%eval(&i-1);
%do i=1 %to &ndays;
%put date&i=&&date&i;
%end;
%mend make_data;
%make_data(start=01/01/2008,stop=12/01/2009);
/* Build macro vars date1..dateN, one per month from &start_month to
   &end_month inclusive (both given as mm/01/yyyy), by pure macro text
   arithmetic on the mm and yyyy substrings.
   NOTE: this redefines the %make_data name used by the daily version. */
%macro make_data(start_month=,end_month=);
%let s_mo=%substr(&start_month,1,2);
%let s_yr=%substr(&start_month,7,4);
%let e_mo=%substr(&end_month,1,2);
%let e_yr=%substr(&end_month,7,4);
%let i=1;
%let date&i=&start_month;
%* loop until the sentinel empty dateI is produced;
%do %while(&&date&i^=);
%if %substr(&&date&i,1,2)<12 %then %do;
%* keep a leading zero for result months 02..09; 10..12 need none;
%if %substr(&&date&i,1,2)<9 %then %let m=0%eval(%substr(&&date&i,1,2)+1);
%else %if %substr(&&date&i,1,2)>=9 %then %let m=%eval(%substr(&&date&i,1,2)+1);
%let y=%substr(&&date&i,7,4); %end;
%* December rolls over to January of the next year;
%if %substr(&&date&i,1,2)=12 %then %do; %let m=01; %let y=%eval(%substr(&&date&i,7,4)+1); %end;
%let i=%eval(&i+1);
%* compare as yyyymm integers to decide whether to keep the next month;
%if %eval(&y*100+&m)<=%eval(&e_yr*100+&e_mo) %then %do; %let date&i=&m./01/&y; %end;
%else %do; %let date&i=; %end;
%end;
%let nmonths=%eval(&i-1);
%do i=1 %to &nmonths;
%put date&i=&&date&i;
%end;
%mend make_data;
%make_data(start_month=01/01/2008,end_month=12/01/2009);
|
Parse apart variables in a macro variable |
---|
/* Split a space-delimited macro variable into var1..varN plus a count.
   NOTE(review): %DO in open code requires SAS 9.4M5+; inside a macro it
   works on any release. */
%let vars=dog cat pig fish;
* WAY 1: scan until %scan returns an empty token (the sentinel);
%let i=1;
%let var&i=%scan(&vars,&i,' ');
%do %while(&&var&i^=);
%let i=%eval(&i+1);
%let var&i=%scan(&vars,&i,' ');
%end;
%let nvar=%eval(&i-1);
* WAY 2: count words first, then scan by index; %superq masks special chars;
%let nvar=%sysfunc(countw(%superq(vars)));
%do i =1 %to &nvar;
%let var&i=%qscan(%superq(vars),&i,%str( ));
%end;
* Show them! ;
%do i=1 %to &nvar;
%put var&i=&&var&i;
%end;
|
Get the sample size of a dataset |
---|
/* Capture a dataset's observation count into macro var &nrows via the
   ODS Attributes table of PROC CONTENTS. */
%let indata=a.dataset;
ods listing close;
ods output Attributes=attr;
proc contents data=&indata;
run;
ods listing;
data _null_;
set attr;
/* SYMPUTX strips the leading blanks that numeric-to-character conversion
   leaves behind with plain CALL SYMPUT */
if Label2="Observations" then call symputx("nrows",nValue2);
run;
%put nrows=&nrows;
|
Determine library name from a data set input via a macro variable |
---|
/* Pull the libref out of a two-level dataset name; a one-level name
   (no '.') means the WORK library. */
%let indata=a.dataset;
%if %index(&indata,.)>0 %then %let libname=%substr(&indata,1,%index(&indata,.)-1);
%if %index(&indata,.)=0 %then %let libname=work;
%put Libname is &libname;
|
Determine the number of levels a variable has |
---|
/* Count the distinct levels of &var in &indata into macro var &numlevel. */
%let indata=a.dataset;
%let var=cut_variable;
proc sql noprint;
/* COUNT(*) over a DISTINCT subquery so that a missing value still counts
   as a level (COUNT(DISTINCT &var) would ignore missings) */
select count(*) into :totlevel
from (select distinct &var from &indata);
quit;
/* %EVAL also strips the leading blanks of the INTO : value */
%let numlevel=%eval(&totlevel);
%put ==> &var has &numlevel levels;
|
Remove quotes from macro variables |
---|
/* Strip the DOUBLE quotes from a macro variable's value; the embedded
   single quotes are left in place.
   NOTE(review): TRANWRD with a zero-length replacement via %sysfunc --
   confirm it deletes rather than substituting a blank on your release. */
%let var = "'test1', 'test2', 'test3'";
%let val=%sysfunc(tranwrd(%quote(&var),%str(%"),%str()));
%put &val;
|
Pull a random sample from a data set |
---|
* Without replacement;
*****using surveyselect;
/* simple random sample of 10 rows; ID _ALL_ keeps every input variable */
proc surveyselect data=hsb25 method=SRS rep=1 sampsize=10 seed=12345 out=hsbs1;
id _all_;
run;
*****not using surveyselect;
/* Sequential (selection) sampling: walk the data once and take each row
   with probability (samples still needed)/(rows left), which yields an
   exact simple random sample of 10 without replacement.
   NOTE: the original read 'hsbs25', a typo for the input dataset hsb25. */
data hsbs1(drop=obsleft sampsize);
sampsize=10;
obsleft=totobs;
do while(sampsize>0);
pickit+1;
if ranuni(0)<sampsize/obsleft then do;
/* NOBS= is resolved at compile time, so totobs is set before the loop */
set hsb25 point=pickit
nobs=totobs;
output;
sampsize=sampsize-1;
end;
obsleft=obsleft-1;
end;
stop; /* required: POINT= reads never hit end-of-file on their own */
run;
* With replacement;
*****using surveyselect;
/* unrestricted random sampling = with replacement */
proc surveyselect data=hsb25 method=URS rep=1 sampsize=10 seed=12345 out=hsbs1;
id _all_;
run;
*****not using surveyselect;
/* pick 10 random row numbers (duplicates allowed) and read them directly */
data hsbs1 (drop=i sampsize);
sampsize=10;
do i=1 to sampsize;
pickit=ceil(ranuni(0)*totobs);
set hsb25 point=pickit
nobs=totobs;
output;
end;
stop; /* required: POINT= reads never hit end-of-file on their own */
run;
|
Cool code to get Historical Stock Data on a Unix box with SAS |
---|
/* Download daily price history for a space-delimited list of tickers and
   merge them by date into &indata (one column set per ticker).
   Requires wget on the host; the Yahoo endpoint may no longer be live. */
%macro get_stock(stocks=,indata=);
/* parse the ticker list into stock1..stockN */
%let i=1;
%let stock&i=%scan(&stocks,&i,' ');
%do %while(&&stock&i^=);
%let i=%eval(&i+1);
%let stock&i=%scan(&stocks,&i,' ');
%end;
%let nstocks=%eval(&i-1);
%do i=1 %to &nstocks;
data _null_;
call system("wget 'http://ichart.finance.yahoo.com/table.csv?s=%trim(&&stock&i)%BQUOTE(') -O historical_stock_prices.csv");
run;
* note you can also use: http://www.google.com/finance/historical?output=csv&q=[Symbol name] ;
data temp&i;
/* TRUNCOVER stops a short final field from consuming the next record */
infile "historical_stock_prices.csv" delimiter="," firstobs=2 truncover;
input cdate $10. Open_&&stock&i High_&&stock&i Low_&&stock&i Close_&&stock&i Volume_&&stock&i Adj_Close_&&stock&i;
/* dates arrive as yyyy-mm-dd; the YYMMDD10. informat replaces the
   original three-SUBSTR MDY() construction */
ndate=input(cdate,yymmdd10.);
format ndate mmddyy10.;
run;
proc sort data=temp&i;
by ndate;
run;
%end;
/* one row per date, all tickers side by side */
data &indata;
merge %do i=1 %to &nstocks; temp&i %end;;
by ndate;
run;
option nocenter;
proc print data=&indata;
run;
/* clean up the temp download */
data _null_;
call system("rm historical_stock_prices.csv");
run;
%mend get_stock;
%get_stock(stocks=BAC AMZN,indata=D.stockdata);
|
Back to Rick McFarland's Library