-
Notifications
You must be signed in to change notification settings - Fork 18
/
crunch.c
3194 lines (2861 loc) · 116 KB
/
crunch.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* character-set based brute-forcing algorithm
* Copyright (C) 2004 by [email protected]
* Copyright (C) 2008, 2009, 2010, 2011, 2012 by [email protected]
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 2 only of the License
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* crunch version 1.0 was written by [email protected]
* all later versions of crunch have been updated by [email protected]
*
* Changelog:
* version 1.0 initial release in 2004 (guess based on copyright date) 194 lines of code
* version 1.1 added support to read character list from a specified file
* version 1.2 added output parameter
* version 1.3 added -c parameter to break apart output file. I would prefer to break
* apart on file size instead of number of lines, but there is no reliable
* way to get that information while the file is being created.
* version 1.4 added better help documentation and fixed some bugs:
* space character was sometimes getting stripped off in charset
* replace if (pattern[0]==0) { with if ((strncmp(&pattern[0],"\0",1)==0)) {
* to make splint happy
* rename templ variable to pattern
* changed some fixed size local buffers to dynamically allocated buffers
* if max-len was larger than strlen of the -t parameter crunch would generate
* duplicate words
* document some of the code
* version 1.5 fixed length and -s parameter. Before if you did ./crunch 1 3 -s
* you would only get j-z and not j-zzz
* converted some fixed length buffers to variable length
* added checks to fclose
* fixed it so that -s and -t can now work together
* added support to generate wordlists greater than 2GB even though several
* programs can't access files of this size (john the ripper for example)
* This support works on Ubuntu intrepid. This probably works on most Linux
* platforms. Support for Solaris, HP-UX, Windows, BSD is unknown.
* version 1.6 add option (-p) to support factorial instead of exponentail processing. i.e.
* if you did ./crunch 3 3 abc you would get 27 results (3x3x3): a through ccc
* This new option will give you 6 results (3! = 3x2x1)
* i.e. (abc, acb, bac, bca, cab, cba)
* the permute function was written by Richard Heathfield
* and copied from
* http://bytes.com/groups/c/536779-richard-heathfields-tsp-permutation-algorithm
* I did change temp from an int to char to make splint happy.
* Richard Heathfield may be contacted at [email protected]
* version 1.7 add option (-m) to support factoring words instead of characters
* version 1.8 add option (-z) to support compressing the output
* version 1.9 only need permute2 (remove permute and renamed permute2 to permute)
* fix off by 1 error - reported by MikeCa
* fix issue where output file wasn't being renamed
* fork progress process
* really fix it so that -s and -t can now work together
* when using -c sometimes the output was 1 larger than requested
* cstring_cmp from http://www.anyexample.com/programming/c/qsort__sorting_array_of_strings__integers_and_structs.xml to sort words to permute so the results will be in sorted order
* version 2.0 permute now supports -c -o & -z. -f is also supported for permuting letters
* really really fix it so that -s and -t can now work together
* added null checks for parameter values to prevent segmentation faults
* added support to invert chunk output - written by Pillenski
* added support to breakup file based on size - written by Pillenski
* version 2.1 permute now supports -b
* add signal handling - pressing ctrl break will cause crunch
* to finish the word it is processing and then gracefully exit instead
* of leaving files open and words half finished. This is the first
* step in supporting resume.
* chunk now supports resume
* version 2.2 pattern supports numbers and symbols
* version 2.3 fix bytecount
* add new options to usage
* version 2.4 fix usage (-m and not -n) - reported by Tape
* clarified -b option in help and man file - reported by Tape
* fix Makefile install to /pentest/passwords not /pentest/password -
* reported by Tape
* make bytecount behave consistently - reported by Tape
* fixed up copyrights
* version 2.5 -q permute supports reading from strings from file
* sorted parameters in code and usage
* -t supports upper and lower case letters
* -f supports multiple charsets
* add -v option to show version information
* add correct gpl.txt (version 2 not 3)
* fix charset.lst (some symbol14 had 15 chars)
* add symbol14, symbol14-space, symbol-all, and symbol-all-space to charset.lst
* removed examples and parameters from usage. I added a sentence
* telling people to refer to the man page. It is easier to update
* the man only.
* combined -m and -p into -p
* got rid of some unnecessary casts
* changed all mallocs to callocs, this shuts up a few splint warnings
* version 2.6 fix -p seg fault - reported by Tape
* improve -p documentation - reported by Tape
* fix Makefile to install to correct directory again - reported by Tape
* fix Makefile to install charset.lst - reported by Tape
* fix memory leak
* replace if (argv[i+1] != NULL) with if (i+1 < argc) as argv[i+1]
* can be filled garbage so it is not an accurate test
* fix an off by 1 in resume counter
* resume now respects the -b parameter, previously it was ignored
* -s now supports the @*%^ symbols in -t
* added status report when saving to file
* renamed some variables to better names
* added a few comments to variables
* added a hex string 0123456789abcdef to charset.lst
* version 2.7 fix progress bar when using -s
* fixed typo man file - Thanks Tape
* add -u option to suppress all filesize and linecount so crunch
* can feed another program
* fork a child process for progress %%
* Niclas Kroon added swedish characters to the charset.lst
* permute supports -t
* added -std=c99 to Makefile since unsigned long long isn't in c89
* ran valgrind and fixed a small memory issue. I forgot to allocate
* space for the \0 character in a few places. Doh!
* improved documentation of the charset field
* version 2.8 fix progress message. It could cause a fatal error under certain
* circumstances
* version 2.9 fix divide by zero error.
* version 3.0 fix wrong version number
* changed the * character of -t option to a , (comma) as * is a reserved character
* strip off blank lines when reading file for permute (-q option)
* I fixed a problem with using -i and -t together
* add -l to allow for literal characters of @,%^
* fix -b and -c and % output
* fix permute -t to work with -b and -c
* fixed crash when / character was in filename
* replace / with space - reported by Nazagul
* version 3.0.1 fix printpermutepattern - it was using $ instead of ,
* version 3.1 make -l take a string so you can have @ and characters
* add TB and PB to output size
* fix comments referencing $ that should be ,
* add -e to end generation after user specified string (useful when piping crunch to a program)
* version 3.2 add -d to limit duplicate characters
* put correct function name into error messages to help with debugging
* fix Makefile uninstall to remove crunch directory and install GPL.TXT
* removed flag5 as it wasn't needed
* if you press Ctrl-C crunch will now print where it stops so you can resume piping crunch into another program
* version 3.3 add more information to help section
* Fixed mem leaks, invalid comparisons - fixed by JasonC
* Error messages and startup summary now go to stderr (-u now unnecessary) - fixed by JasonC
* Fixed startup delay due to long sequences of dupe-skipped strings - fixed by JasonC
* Added unicode support - written by JasonC
* fix write and compress error - reported and fixed by amontero
* fix printpercentage -> linecounter should be ->linetotal
* add support for 7z
* version 3.4 fix -e problem reported by hajjid
* test compile using Ubuntu 12.10 and fixed the following issues:
* reorder flags in Makefile so crunch can compile successfully
* remove finall variable from printpercentage
* remove loaded from main
*
* TODO: Listed in no particular order
* add resume support to permute (I am not sure this is possible)
* make permute more intelligent (min, max) (I am not sure this is possible either)
* support SIGINFO when Linux supports it, use SIGUSR1 until SIGINFO is available
* finalbytecount isn't currently correct for unicode chars unless -p used
* let user specify placeholder characters (@,%^)
* add date support?
* specify multiple charset names using -f i.e. -f charset.lst + ualpha 123 +
* make permute use -e
* revamp compression part of renamefile 7z doesn't delete original file
* maybe fork compression part of renamefile
* size calculations are wrong when min or max is larger than 12
* newer gcc complains about pidret as not being used
*
* usage: ./crunch <min-len> <max-len> [charset]
* e.g: ./crunch 3 7 abcdef
*
* This example will compute all passwords between 3 and 7 chars
* using 'abcdef' as the character set and dump it to stdout.
*
* usage: ./crunch <from-len> <to-len> [-f <path to charset.lst> charset-name] [-o wordlist.txt or START] [-t [FIXED]@@@@] [-s startblock]
*
* Options:
* -b : maximum bytes to write to output file. depending on the blocksize
* files may be some bytes smaller than specified but never bigger.
* -c : numbers of lines to write to output file, only works if "-o START"
* is used, eg: 60 The output files will be in the format of starting
* letter - ending letter for example:
* crunch 1 5 -f /pentest/password/charset.lst mixalpha -o START -c 52
* will result in 2 files: a-7.txt and 8-\ .txt The reason for the
* slash in the second filename is the ending character is space and
* ls has to escape it to print it. Yes you will need to put in
* the \ when specifying the filename.
* -d : specify -d [n][@,%^] to suppress generation of strings with more
* than [n] adjacent duplicates from the given character set. For example:
* ./crunch 5 5 -d 2@
* Will print all combinations with 2 or less adjacent lowercase duplicates.
* -e : tells crunch to stop generating words at string. Useful when piping
* crunch to another program.
* -f : path to a file containing a list of character sets, eg: charset.lst
* name of the character set in the above file eg:
* mixalpha-numeric-all-space
* -i : inverts the output so the first character will change very often
* -l : literal characters to use in -t @,%^
* -o : allows you to specify the file to write the output to, eg:
* wordlist.txt
* -p : prints permutations without repeating characters. This option
* CANNOT be used with -s. It also ignores min and max lengths.
* -q : Like the -p option except it reads the strings from the specified
* file. It CANNOT be used with -s. It also ignores min and max.
* -r : resume a previous session. You must use the same command line as
* the previous session.
* -s : allows you to specify the starting string, eg: 03god22fs
* -t [FIXED]@,%^ : allows you to specify a pattern, eg: @@god@@@@
* where the only the @'s will change with lowercase letters
* the ,'s will change with uppercase letters
* the %'s will change with numbers
* the ^'s will change with symbols
* -u : only print words; supress file size information, aka unheard
* NOT NEEDED ANYMORE
* -z : adds support to compress the generated output. Must be used
* with -o option. Only supports gzip, bzip, lzma, and 7z.
*
* This code can be easily adapted for use in brute-force attacks
* against network services or cryptography.
*
* Compiles on: linux (32 and 64 bit Ubuntu for sure, 32 and 64 bit Linux in
* general works. I have received word that crunch compiles on MacOS.
* It should compile on freebsd and the other Unix and Linux OSs but I don't
* don't have access to any of the those systems. Please let me know.
*/
#include <assert.h>
#include <locale.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <ctype.h>
#include <errno.h>
#include <signal.h>
#include <math.h>
#include <pthread.h>
#include <unistd.h>
#include <limits.h>
#include <sys/wait.h>
#include <sys/types.h>
/* largest output string */
#define MAXSTRING 128
/* longest character set */
#define MAXCSET 256
/* invalid index for size_t's */
#define NPOS ((size_t)-1)
static const wchar_t def_low_charset[] = L"abcdefghijklmnopqrstuvwxyz";
static const wchar_t def_upp_charset[] = L"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
static const wchar_t def_num_charset[] = L"0123456789";
static const wchar_t def_sym_charset[] = L"!@#$%^&*()-_+=~`[]{}|\\:;\"'<>,.?/ ";
static const char version[] = "3.4";
static size_t inc[128];
static size_t numofelements = 0;
static size_t inverted = 0; /* 0 for normal output 1 for aaa,baa,caa,etc */
static unsigned long long linecount = 0; /* user specified break output into count lines */
static unsigned long long bytecount = 0 ; /* user specified break output into size */
static volatile sig_atomic_t ctrlbreak = 0; /* 0 user did NOT press Ctrl-C 1 they did */
static FILE *fptr; /* file pointer */
static int output_unicode = 0; /* bool. If nonzero, all output will be unicode. Can be set even if non-unicode input*/
/* When unicode input is given, the code cannot [currently] always calculate
the exact final file size. In those cases this var simply suppresses
the output. For what it's worth, the calculated size is always the minimum size. */
static int suppress_finalsize = 0; /* bool */
/*
Need a largish global buffer for converting wide chars to multibyte
to avoid doing an alloc on every single output line
leading to heap frag. Only alloc'd once at startup.
Size is MAXSTRING*MB_CUR_MAX+1
*/
static char* gconvbuffer = NULL;
static size_t gconvlen = 0;
struct thread_data{
unsigned long long finalfilesize; /* total size of output */
unsigned long long bytetotal; /* total number of bytes so far */
unsigned long long bytecounter; /* count number of bytes in output resets to 0 */
unsigned long long finallinecount; /* total size of output */
unsigned long long linetotal; /* total number of lines so far */
unsigned long long linecounter; /* counts number of lines in output resets to 0 */
};
/* pattern info */
struct pinfo {
wchar_t *cset; /* character set pattern[i] is member of */
size_t clen;
int is_fixed; /* whether pattern[i] is a fixed value */
size_t start_index, end_index; /* index into cset for the start and end strings */
size_t duplicates;
};
/* program options */
struct opts_struct {
wchar_t *low_charset;
wchar_t *upp_charset;
wchar_t *num_charset;
wchar_t *sym_charset;
size_t clen, ulen, nlen, slen;
wchar_t *pattern;
size_t plen;
wchar_t *literalstring;
wchar_t *startstring;
wchar_t *endstring;
size_t duplicates[4]; /* allowed number of duplicates for each charset */
size_t min, max;
wchar_t *last_min; /* last string of length min */
wchar_t *first_max; /* first string of length max */
wchar_t *min_string;
wchar_t *max_string; /* either startstring/endstring or calculated using the pattern */
struct pinfo *pattern_info; /* information generated from pattern */
};
typedef struct opts_struct options_type;
static struct thread_data my_thread;
static int wcstring_cmp(const void *a, const void *b) {
const wchar_t **ia = (const wchar_t **)a;
const wchar_t **ib = (const wchar_t **)b;
return wcscmp(*ia, *ib);
}
static void ex_program() {
ctrlbreak = 1;
(void) signal(SIGINT, SIG_DFL);
}
static size_t force_wide_string(wchar_t *wout, const char *s, size_t n) {
size_t i;
const unsigned char *ucp = (const unsigned char*)s;
size_t slen = strlen(s);
/*
Blindly convert all characters to the numerically equivalent wchars.
This is intended to be used after a call to mbstowcs fails.
Like mbstowcs(), output may not be null terminated if returns n
*/
for (i=0; i<n && i<slen; ++i) {
wout[i] = (wchar_t)ucp[i];
}
if (i<n)
wout[i] = 0;
return i;
}
static size_t make_wide_string(wchar_t *wout, const char *s, size_t n, int* r_is_unicode) {
size_t stres;
const char* cp;
int contains_upp128 = 0;
/*
If 's' contains a UTF-8 string which is not plain 7bit,
is_unicode is set nonzero and wout contains the proper wide string.
Otherwise the code points are assumed to be the exact values in s.
Unlike mbstowcs, result is always null terminated as long as
n is nonzero.
Leave r_is_unicode undisturbed unless setting to nonzero!
(must never be changed from 1 to 0 regardless of this call's data)
*/
for (cp = s; *cp; ++cp) {
if ((int)*cp < 0) {
contains_upp128 = 1;
break;
}
}
stres = mbstowcs(wout,s,n);
if (stres != NPOS) {
if (contains_upp128 && r_is_unicode)
*r_is_unicode = 1;
}
else
stres = force_wide_string(wout,s,n);
if (n != 0)
wout[n-1]=0;
return stres;
}
static wchar_t *alloc_wide_string(const char *s, int* r_is_unicode) {
wchar_t* wstr = NULL;
size_t len = s ? strlen(s)+1 : 1;
wstr = (wchar_t*)malloc(len*sizeof(wchar_t));
if (!wstr) {
fprintf(stderr,"alloc_wide_string: Can't allocate mem!\n");
exit(EXIT_FAILURE);
}
(void)make_wide_string(wstr,s?s:"",len,r_is_unicode);
return wstr;
}
static size_t force_narrow_string(char *out, const wchar_t* src, size_t n) {
size_t i;
for (i=0; i<n && src[i]!=L'\0'; ++i) {
out[i] = (char)(src[i]&0xFF);
}
if (i<n)
out[i] = '\0';
return i;
}
static size_t make_narrow_string(char *out, const wchar_t* src, size_t n) {
size_t retval;
/*
If global output_unicode is true, src is converted to a UTF-8 string.
If not, the low 8 bits are copied to the output string
which is most definitely not what you want unless the input
wasn't unicode in the first place.
Unlike wcstomb(), output is always null terminated as long as n!=0
*/
if (output_unicode != 0) {
retval = wcstombs(out, src, n);
if (retval == NPOS) {
fprintf(stderr,"Error: wcstombs() failed. This shouldn't have happened.\n");
exit(EXIT_FAILURE);
}
}
else
retval = force_narrow_string(out, src, n);
if (n!=0)
out[n-1]='\0';
return retval;
}
static int getmblen(wchar_t wc) {
/* returns number of bytes required for wide char wc, taking into account current setting of global output_unicode */
int mblen = 1;
if (output_unicode) {
char mb[MB_CUR_MAX+1];
mblen = wctomb(mb,wc);
if (mblen == -1) {
fprintf(stderr,"Warning: wctomb failed for char U+%04lX\n",(unsigned long)wc);
mblen = 1;
}
}
return mblen;
}
static wchar_t *dupwcs(const wchar_t *s) {
/* replacement for wcsdup, where not avail (it's POSIX) */
size_t n;
wchar_t *p = NULL;
if (s != NULL)
{
n = (1 + wcslen(s)) * sizeof(wchar_t);
if ((p = (wchar_t*)malloc(n))!=NULL)
memcpy(p, s, n);
}
return p;
}
/* return 0 if string1 does not comply with options.pattern and options.literalstring */
static int check_member(const wchar_t *string1, const options_type* options) {
const wchar_t *cset;
size_t i;
for (i = 0; i < wcslen(string1); i++) {
cset = NULL;
switch (options->pattern[i]) {
case L'@':
if (options->literalstring[i] != L'@')
cset = options->low_charset;
break;
case L',':
if (options->literalstring[i] != L',')
cset = options->upp_charset;
break;
case L'%':
if (options->literalstring[i] != L'%')
cset = options->num_charset;
break;
case L'^':
if (options->literalstring[i] != L'^')
cset = options->sym_charset;
break;
default: /* constant part of pattern */
break;
}
if (cset == NULL) {
if (string1[i] != options->pattern[i])
return 0;
continue;
}
while (*cset)
if (string1[i] == *cset)
break;
else
cset++;
if (*cset == L'\0')
return 0;
}
return 1;
}
/* NOTE: similar to strpbrk but length limited and only searches for a single char */
static size_t find_index(const wchar_t *cset, size_t clen, wchar_t tofind) {
size_t i;
for (i = 0; i < clen; i++)
if (cset[i] == tofind)
return i;
return NPOS;
}
static void fill_minmax_strings(options_type *options) {
size_t i;
wchar_t *last_min; /* last string of size min */
wchar_t *first_max; /* first string of size max */
wchar_t *min_string, *max_string; /* first string of size min, last string of size max */
last_min = calloc(options->min + 1,sizeof(wchar_t));
if (last_min == NULL) {
fprintf(stderr,"fill_minmax_strings: can't allocate memory for last_min\n");
exit(EXIT_FAILURE);
}
last_min[options->min] = L'\0';
first_max = calloc(options->max + 1,sizeof(wchar_t));
if (first_max == NULL) {
fprintf(stderr,"fill_minmax_strings: can't allocate memory for first_max\n");
exit(EXIT_FAILURE);
}
first_max[options->max] = L'\0';
min_string = calloc(options->min + 1,sizeof(wchar_t));
if (min_string == NULL) {
fprintf(stderr,"fill_minmax_strings: can't allocate memory for min_string\n");
exit(EXIT_FAILURE);
}
min_string[options->min] = L'\0';
max_string = calloc(options->max + 1,sizeof(wchar_t));
if (max_string == NULL) {
fprintf(stderr,"fill_minmax_strings: can't allocate memory for max_string\n");
exit(EXIT_FAILURE);
}
max_string[options->max] = L'\0';
/* fill last_min and first_max */
for (i = 0; i < options->max; i++) {
if (options->pattern == NULL) {
if (i < options->min) {
last_min[i] = options->low_charset[options->clen-1];
min_string[i] = options->low_charset[0];
}
first_max[i] = options->low_charset[0];
max_string[i] = options->low_charset[options->clen-1];
}
else { /* min == max */
min_string[i] = max_string[i] = last_min[i] = first_max[i] = options->pattern[i];
switch (options->pattern[i]) {
case L'@':
if (options->literalstring[i] != L'@') {
max_string[i] = last_min[i] = options->low_charset[options->clen - 1];
min_string[i] = first_max[i] = options->low_charset[0];
}
break;
case L',':
if (options->literalstring[i] != L',') {
max_string[i] = last_min[i] = options->upp_charset[options->ulen - 1];
min_string[i] = first_max[i] = options->upp_charset[0];
}
break;
case L'%':
if (options->literalstring[i] != L'%') {
max_string[i] = last_min[i] = options->num_charset[options->nlen - 1];
min_string[i] = first_max[i] = options->num_charset[0];
}
break;
case L'^':
if (options->literalstring[i] != L'^') {
max_string[i] = last_min[i] = options->sym_charset[options->slen - 1];
min_string[i] = first_max[i] = options->sym_charset[0];
}
break;
default:
break;
}
}
}
options->last_min = last_min;
options->first_max = first_max;
if (options->startstring)
for (i = 0; i < options->min; i++)
min_string[i] = options->startstring[i];
if (options->endstring)
for (i = 0; i < options->max; i++)
max_string[i] = options->endstring[i];
options->min_string = min_string;
options->max_string = max_string;
}
static void fill_pattern_info(options_type *options) {
struct pinfo *p;
wchar_t *cset;
size_t i, clen, index, si, ei;
int is_fixed;
size_t dupes;
options->pattern_info = calloc(options->max, sizeof(struct pinfo));
if (options->pattern_info == NULL) {
fprintf(stderr,"fill_pattern_info: can't allocate memory for pattern info\n");
exit(EXIT_FAILURE);
}
for (i = 0; i < options->max; i++) {
cset = NULL;
clen = 0;
index = 0;
is_fixed = 1;
dupes = options->duplicates[0];
if (options->pattern == NULL) {
cset = options->low_charset;
clen = options->clen;
is_fixed = 0;
}
else {
cset = NULL;
switch (options->pattern[i]) {
case L'@':
if (options->literalstring[i] != L'@') {
cset = options->low_charset;
clen = options->clen;
is_fixed = 0;
dupes = options->duplicates[0];
}
break;
case L',':
if (options->literalstring[i] != L',') {
cset = options->upp_charset;
clen = options->ulen;
is_fixed = 0;
dupes = options->duplicates[1];
}
break;
case L'%':
if (options->literalstring[i] != L'%') {
cset = options->num_charset;
clen = options->nlen;
is_fixed = 0;
dupes = options->duplicates[2];
}
break;
case L'^':
if (options->literalstring[i] != L'^') {
cset = options->sym_charset;
clen = options->slen;
is_fixed = 0;
dupes = options->duplicates[3];
}
break;
default: /* constant part of pattern */
break;
}
}
if (cset == NULL) {
/* fixed character. find its charset and index within. */
cset = options->low_charset;
clen = options->clen;
dupes = options->duplicates[0];
if (options->pattern == NULL) {
fprintf(stderr,"fill_pattern_info: options->pattern is NULL!\n");
exit(EXIT_FAILURE);
}
if ((index = find_index(cset, clen, options->pattern[i])) == NPOS) {
cset = options->upp_charset;
clen = options->ulen;
dupes = options->duplicates[1];
if ((index = find_index(cset, clen, options->pattern[i])) == NPOS) {
cset = options->num_charset;
clen = options->nlen;
dupes = options->duplicates[2];
if ((index = find_index(cset, clen, options->pattern[i])) == NPOS) {
cset = options->sym_charset;
clen = options->slen;
dupes = options->duplicates[3];
if ((index = find_index(cset, clen, options->pattern[i])) == NPOS) {
cset = NULL;
clen = 0;
dupes = (size_t)-1;
index = 0;
}
}
}
}
si = ei = index;
}
else {
/*
si = find_index(cset, clen, options->min_string[i]);
*/
/* That can't be right. Would only work when we have a pattern
or min==max, neither is guaranteed here. Or is it? -jC */
if (i < wcslen(options->min_string))
si = find_index(cset, clen, options->min_string[i]);
else
si = 0;
ei = find_index(cset, clen, options->max_string[i]);
if (si == NPOS || ei == NPOS) {
fprintf(stderr,"fill_pattern_info: Internal error: "\
"Can't find char at pos #%lu in cset\n",
(unsigned long)i+1);
exit(EXIT_FAILURE);
}
}
p = &(options->pattern_info[i]);
p->cset = cset;
p->clen = clen;
p->is_fixed = is_fixed;
p->start_index = si;
p->end_index = ei;
p->duplicates = dupes;
}
#ifdef DEBUG
printf("pattern_info duplicates: ");
for (i = 0; i < options->max; i++) {
p = &(options->pattern_info[i]);
printf("(%d %d %d %d %d %d)", p->cset, p->clen, p->is_fixed, p->start_index, p->end_index, p->duplicates);
}
printf("\n");
#endif
}
static unsigned long long fill_next_count(size_t si, size_t ei, int repeats, unsigned long long sum, /*@null@*/ unsigned long long *current_count, /*@out@*/ unsigned long long *next_count, size_t len) {
size_t i;
unsigned long long nextsum = 0;
for (i = 0; i < si; i++)
next_count[i] = 0;
if (repeats == 0 || current_count == NULL)
nextsum = sum;
else if (repeats == 1)
for (; i <= ei; i++) {
next_count[i] = sum - current_count[i];
nextsum += next_count[i];
}
else /* repeats > 1 */
for (; i <= ei; i++) {
next_count[i] = current_count[i];
nextsum += next_count[i];
}
for (; i < len; i++)
next_count[i] = 0;
return nextsum;
}
/* calculate the number of strings from start to end taking into account duplicate removal worst case: O(2^n), could be improved using memoize */
static unsigned long long calculate_with_dupes(int start_point, int end_point, size_t first, size_t last, size_t pattern_index, int repeats, unsigned long long current_sum, /*@null@*/ unsigned long long *current_count, const options_type options, size_t plen) {
/* start_point and end_point are bools */
unsigned long long count[MAXCSET];
unsigned long long *next_count;
unsigned long long total = 0, sum = 0;
size_t start_index, end_index;
size_t i;
if (first == NPOS || last == NPOS || first > last || pattern_index > plen) /* sanity check */
return 0;
/* check for too many duplicates */
if (pattern_index > 0 && (size_t)repeats > options.pattern_info[pattern_index-1].duplicates) {
return 0;
}
/* create the result count for pattern_index - 1 */
if (pattern_index == 0) {
sum = current_sum;
next_count = NULL;
}
else if (pattern_index == 1 ||
(options.pattern_info[pattern_index-1].cset != NULL &&
options.pattern_info[pattern_index-2].cset != options.pattern_info[pattern_index-1].cset) ||
(options.pattern_info[pattern_index-1].cset == NULL &&
options.pattern[pattern_index-2] != options.pattern[pattern_index-1])) {
/* beginning new cset */
if (repeats > 1) {
return 0;
}
if (options.pattern_info[pattern_index-1].cset != NULL) {
for (i = 0; i < first; i++)
count[i] = 0;
for (; i <= last; i++)
count[i] = current_sum;
for (; i < options.pattern_info[pattern_index-1].clen; i++)
count[i] = 0;
next_count = count;
} else
next_count = NULL;
sum = current_sum * (last - first + 1);
}
else if (options.pattern_info[pattern_index-1].cset == NULL &&
options.pattern[pattern_index-2] == options.pattern[pattern_index-1]) {
/* continuing unknown cset */
sum = current_sum;
next_count = NULL;
}
else {
/* continuing previous cset */
sum = fill_next_count(first, last, repeats, current_sum, current_count, count, options.pattern_info[pattern_index-1].clen);
next_count = count;
}
if (sum == 0) {
return 0;
}
if (pattern_index == plen) { /* check for the end of the pattern */
return sum;
}
if (options.pattern_info[pattern_index].is_fixed) {
start_index = end_index = options.pattern_info[pattern_index].start_index;
}
else {
start_index = 0;
end_index = options.pattern_info[pattern_index].clen - 1;
if (start_point)
start_index = options.pattern_info[pattern_index].start_index;
if (end_point)
end_index = options.pattern_info[pattern_index].end_index;
}
if (start_index == end_index) {
/* [0..si..0](sp, ep) */
if (repeats > 0)
total += calculate_with_dupes(start_point, end_point, start_index, end_index, pattern_index + 1, repeats + 1, sum, next_count, options, plen);
total += calculate_with_dupes(start_point, end_point, start_index, end_index, pattern_index + 1, 1, sum, next_count, options, plen);
}
else {
if (start_point) {
/* [0..si..0](sp, 0) */
if (repeats > 0)
total += calculate_with_dupes(start_point, 0, start_index, start_index, pattern_index + 1, repeats + 1, sum, next_count, options, plen);
total += calculate_with_dupes(start_point, 0, start_index, start_index, pattern_index + 1, 1, sum, next_count, options, plen);
start_index++;
}
if (end_point) {
/* [0..ei..0](0, ep) */
if (repeats > 0)
total += calculate_with_dupes(0, end_point, end_index, end_index, pattern_index + 1, repeats + 1, sum, next_count, options, plen);
total += calculate_with_dupes(0, end_point, end_index, end_index, pattern_index + 1, 1, sum, next_count, options, plen);
end_index--;
}
if (start_index <= end_index) { /* middle */
/* [0..si..ei..0](0,0) */
if (repeats > 0)
total += calculate_with_dupes(0, 0, start_index, end_index, pattern_index + 1, repeats + 1, sum, next_count, options, plen);
total += calculate_with_dupes(0, 0, start_index, end_index, pattern_index + 1, 1, sum, next_count, options, plen);
}
}
return total;
}
/* calculate the number of strings from start to end O(n) */
static unsigned long long calculate_simple(const wchar_t *startstring, const wchar_t *endstring, const wchar_t *cset, size_t clen) {
unsigned long long total = 1;
size_t index1, index2;
for (; *endstring && *startstring; endstring++, startstring++) {
for (index1 = 0; cset[index1] != L'\0' && cset[index1] != *endstring;)
index1++;
if (cset[index1] == L'\0')
index1 -= 1;
for (index2 = 0; cset[index2] != L'\0' && cset[index2] != *startstring;)
index2++;
if (cset[index2] == L'\0')
index2 = 0;
total = (total - 1) * (unsigned long long)clen + (unsigned long long)index1 - (unsigned long long)index2 + 1;
}
return total;
}
/* calculate the number of strings from start to end, inclusive, taking into account the pattern O(n) */
static unsigned long long calculate_with_pattern(const wchar_t *startstring, const wchar_t *endstring, const options_type options) {
unsigned long long total_strings = 1;
size_t temp, clen;
const wchar_t *cset;
size_t index1, index2;
if (options.pattern == NULL) {
return calculate_simple(startstring, endstring, options.low_charset, options.clen);
}
for (temp = 0; temp < options.plen; temp++) {
cset = NULL;
clen = 0;
switch (options.pattern[temp]) {
case L'@':
if (options.literalstring[temp] != L'@') {
clen = options.clen;
cset = options.low_charset;
}
break;
case L',':
if (options.literalstring[temp] != L',') {
clen = options.ulen;
cset = options.upp_charset;
}
break;
case L'%':
if (options.literalstring[temp] != L'%') {
clen = options.nlen;
cset = options.num_charset;
}
break;
case L'^':
if (options.literalstring[temp] != L'^') {
clen = options.slen;
cset = options.sym_charset;
}
break;
default: /* constant part of pattern */
break;
}
if (cset) {
for (index1 = 0; index1 < clen && cset[index1] != endstring[temp];)
index1++;
if (index1 == clen)
index1 = clen - 1;
for (index2 = 0; index2 < clen && cset[index2] != startstring[temp];)
index2++;
if (index2 == clen)
index2 = 0;
total_strings = (total_strings - 1) * clen + (unsigned long long)index1 - (unsigned long long)index2 + 1;
}
}
return total_strings;
}
/* calculate the number of lines and bytes to output */
static void count_strings(unsigned long long *lines, unsigned long long *bytes, const options_type options) {
const wchar_t *startstring = options.startstring, *endstring = options.endstring;
size_t min = options.min, max = options.max;
size_t i, j;
unsigned long long temp;
int check_dupes; /* need to take duplicates into account */
unsigned long long extra_unicode_bytes = 0;
/* parameters for counting taking dupes into consideration */
size_t first, last; /* index of the first character and last character */
int start_point = 0, end_point = 0; /* bool; whether to consider a starting or ending string */
if (max == 0) {
*lines = 0;
*bytes = 0;
return;