-
Notifications
You must be signed in to change notification settings - Fork 8
/
dbcon.sas
294 lines (269 loc) · 9.16 KB
/
dbcon.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
%macro dbcon
/*----------------------------------------------------------------------
Summarize the contents of a dataset.
----------------------------------------------------------------------*/
(data /* Dataset name */
,maxchar=40 /* Maximum length of character variables */
,maxobs=100000 /* Maximum observations before using sampling */
,select= /* Variable names to select for analysis */
,exclude= /* Variable names to exclude from analysis */
,outval=_dbvals /* Dataset name for values output */
,outsum=_dbvars /* Dataset name for variable summary output */
,fname= /* Fileref or filename in quotes for text file */
,nval=10 /* Number of distinct values to print */
,printn=0 /* Include value frequency when text file is made */
);
/*----------------------------------------------------------------------
Calls: parmv.sas qlist.sas contentv.sas nobs.sas
Usage notes:
This macro will potentially generate a very large work dataset to
contain all of the possible values. To reduce the size of this dataset
and reduce processing time you can use these options:
- Use the SELECT and/or EXCLUDE options to limit the variables that
will be summarized.
- Use the MAXCHAR option to truncate long character variables.
- Set MAXOBS to a value > 0. This will cause macro to sample the
dataset when there are more than MAXOBS observations.
----------------------------------------------------------------------*/
%local macro parmerr nobs sample varlist maxlen flen trunc anynum anychar;
%let macro=&sysmacroname;
%parmv(data,_req=1)
%parmv(nval,_val=positive,_def=10)
%parmv(maxchar,_val=nonnegative,_def=40)
%parmv(exclude,_words=1)
%parmv(select,_words=1)
%parmv(outval,_req=1)
%parmv(outsum,_req=1)
%parmv(maxobs,_val=nonnegative)
%parmv(printn,_val=0 1)
%if (&parmerr) %then %goto quit;
%nobs(&data);
%if (&nobs < 0 ) %then %do;
%parmv(DATA,_msg=Dataset not found)
%goto quit;
%end;
*----------------------------------------------------------------------;
* Get attributes for variables in the input dataset ;
*----------------------------------------------------------------------;
%contentv(&data,out=dsinfo,genfmt=0)
*----------------------------------------------------------------------;
* Reduce to variables of interest and find maximum length char var ;
*----------------------------------------------------------------------;
proc sql noprint;
create table dsinfo as
select libname,memname,name,varnum,type,length,format,label
from dsinfo
where 1
%if %length(&select) %then
& upcase(name) in %qlist(&select)
;
%if %length(&exclude) %then
& upcase(name) ^in %qlist(&exclude)
;
order name
;
select name into :varlist separated by ' ' from dsinfo;
select max(length),max(type='num'),max(type='char')
into :maxlen,:anynum,:anychar
from dsinfo
;
quit;
%if (^%length(&varlist)) %then %do;
%parmv(_msg=No variables selected from &data)
%goto quit;
%end;
%let sample=0;
%if (&maxobs and (&nobs > &maxobs)) %then %do;
*----------------------------------------------------------------------;
* NOTE: When dataset is large then only take a sample of the data ;
*----------------------------------------------------------------------;
%let sample=%sysfunc(ceil(&nobs/&maxobs));
%put NOTE: &data has &nobs observations. This macro will only read
every &sample.th observation.;
%end;
%*---------------------------------------------------------------------
When MAXCHAR is specified then use instead of the actual max length.
Set TRUNC flag to indicate that values have been truncated.
-----------------------------------------------------------------------;
%let trunc=0;
%if (&maxchar and (&maxchar < &maxlen)) %then %do;
%let trunc=1;
%let maxlen=&maxchar;
%end;
*----------------------------------------------------------------------;
* Get values for all variables into tall skinny table. ;
*----------------------------------------------------------------------;
data &outval(rename=(__name=name __cvalue=cvalue __value=value));
set &data(keep=&varlist);
keep __name __cvalue __value;
%if (&sample) %then %do;
if (mod(_n_,&sample) = 0);
%end;
%if (&anychar) %then %do;
array _c _character_;
%end;
%if (&anynum) %then %do;
array _n _numeric_;
%end;
length __name $32 __value 8 __cvalue $&maxlen;
format __value best12.;
%if (&anychar) %then %do;
__value=.;
do _i = 1 to dim(_c);
call vname(_c{_i},__name);
__name = upcase(__name);
__cvalue = _c{_i};
output;
end;
%end;
%if (&anynum) %then %do;
__cvalue = '';
do _i = 1 to dim(_n);
call vname(_n{_i},__name);
__name = upcase(__name);
__value = _n{_i};
output;
end;
%end;
run;
*----------------------------------------------------------------------;
* Summarize to unique values and frequency. ;
*----------------------------------------------------------------------;
proc summary data=&outval nway missing;
class name value cvalue / groupinternal;
output out=&outval(keep=name value cvalue _freq_);
run;
%if (^&trunc) %then %do;
*----------------------------------------------------------------------;
* Calculate maximum actual length of character variables ;
*----------------------------------------------------------------------;
proc sql noprint;
create table _maxlen as
select name,max(length(cvalue)) as maxlen
from &outval
where cvalue ne ' '
group by name
;
quit;
%end;
*----------------------------------------------------------------------;
* Count number of unique values for each name.;
*----------------------------------------------------------------------;
proc summary data=&outval nway;
by name;
var _freq_;
output max=maxfreq
out=&outsum(keep=name _freq_ maxfreq rename=(_freq_=nval))
;
run;
*----------------------------------------------------------------------;
* Combine the attribute information with number of distinct values and ;
* keep the actual first value. ;
*----------------------------------------------------------------------;
data &outsum;
merge
dsinfo
&outsum
&outval(drop=_freq_)
%if (^&trunc) %then _maxlen;
;
by name;
if first.name;
if nval=1 and ((type='char' and cvalue=' ') or
(type='num' and value=.)) then nval=0;
label value='First value (numeric)'
cvalue='First value (character)'
nval ='Number of distinct values (0=all missing)'
maxfreq='Frequency of most frequent value'
maxlen='Maximum length of character data'
;
%if (&trunc) %then %do;
maxlen=.;
%end;
run;
%if (%length(&fname)) %then %do;
/*----------------------------------------------------------------------
When FNAME is supplied then write summary to output text file.
For names with more than &NVAL unique values, only list &nval values
(half each from the start and the end) are listed.
When PRINTN is requested then frequency count for each value will be
printed before the values.
When formatted value is different than the raw value then the formatted
value is printed after the raw value.
----------------------------------------------------------------------*/
%*----------------------------------------------------------------------
Round NVAL up to next even number
-----------------------------------------------------------------------;
%if %sysfunc(mod(&nval,2)) %then %do;
%let nval=%eval(&nval+1);
%put NOTE: Rounding NVAL up to even number &nval..;
%end;
%if (&printn) %then %do;
*----------------------------------------------------------------------;
* Determine characters needed to display frequencies using COMMA format;
*----------------------------------------------------------------------;
proc sql noprint;
select max(maxfreq) into :flen from &outsum;
quit;
%let flen=%sysfunc(max(&flen,&flen),comma32.);
%let flen=&flen;
%let flen=%length(&flen);
%end;
data _null_;
merge &outsum &outval;
by name;
file &fname;
if (_n_=1) then put
'~' 69*'='
/ memname "NOBS=&nobs [Display limited to &nval values]"
%if (&trunc) %then " <Max length &maxlen>" ;
/ 70*'='
;
if (first.name) then do;
k = 1;
%if (&printn) %then %do;
put @&flen 'N ' @;
%end;
put name 'LEN=' @;
if type='char' then put '$'@;
put length nval= @;
if maxlen ne . then put maxlen= @;
if format ne ' ' then put format= @;
if name ne label and label ne ' ' then put label= @;
put ;
if nval then put 70*'-';
end;
else k + 1;
if nval=0 then out=0;
else if nval<= &nval+1 then out=1;
else out = (k <= ceil(&nval/2)) | (nval - k < ceil(&nval/2));
if (out);
%if (&printn) %then %do;
put _freq_ comma&flen.. +1 @;
%end;
if (type = 'num') then do;
if (format = '') then put value best12.;
else do;
cvalue = putn(value,format);
if (input(cvalue,??best12.) ^= value) then put value best12. +1 cvalue;
else put value best12.;
end;
end;
else do;
put cvalue @;
if (compress(format,'$1234567890.') ^= '') then do;
fcvalue = left(putc(cvalue,format));
if (fcvalue ^= cvalue) then put @&maxlen+1 fcvalue;
else put;
end;
else put;
end;
if (nval > &nval+1) & (k = ceil(&nval/2)) then put
%if (&printn) %then &flen*'.' '.' ;
'............'
;
if (last.name) then put 70*'_';
run;
%end;
%quit:
%mend dbcon;