#!/usr/bin/perl -sp0777 # BASIC text listing cruncher # Greg Cook, 23/Aug/2011 # Usage: perl -s crunch.pl # [-h] [-l] [-w=WIDTH] INFILE > OUTFILE 2> VARLIST # Warning! Correctness definitely not guaranteed! # Does: space removal, numeric constant compaction. # Does (if -h specified): keyword abbreviation, comment removal, # variable name compaction, line consolidation, some redundant # parenthesis removal. # Doesn't do: DATA/VDU consolidation, IF FALSE ELSE optimisation, # full redundant parenthesis removal, higher-level optimisations. # Mangles: PROC/FN names containing keywords, # numeric constants . .E .E0 .E+ etc. # NB: If listing has line numbers but no GOTOs etc., specify -l. # -w specifies maximum length of text between (and excluding) # newline sequences (CR LF). -w=238 for AUTO entry (default), # -w=233 to leave room for longest line number # Leaves already crunched programs unchanged $KEYWORDS="ABS|ACS|ADVAL|AND|ASC|ASN|ATN|AUTO|BGET|BPUT" ."|CALL|CHAIN|CLG|CLEAR|CLOSE|CLS|COLOUR|COS|COUNT" ."|DATA|DEF|DEG|DIM|DIV|DRAW|ELSE|END|ENDPROC" ."|ENVELOPE|EOF|EOR|ERL|ERR|ERROR|EVAL|EXP|EXT" ."|FALSE|FN|FOR|GCOL|GET|GOSUB|GOTO|HIMEM|IF" ."|INKEY|INPUT|INT|LEN|LET|LINE|LN|LOCAL|LOG" ."|LOMEM|MOD|MODE|MOVE|NEW|NEXT|NOT|OFF|ON|OPENIN" ."|OPENOUT|OPENUP|OR|OSCLI|PAGE|PI|PLOT|POS|PRINT" ."|PROC|PTR|RAD|READ|REM|REPEAT|REPORT|RESTORE" ."|RETURN|RUN|SGN|SIN|SOUND|SPC|SQR|STEP|STOP|TAN" ."|THEN|TIME|TOP|TO|TRACE|TRUE|UNTIL|USR|VAL|VDU" ."|VPOS|WIDTH"; $KEYWORDS2="CHR\\\$|DELETE|GET\\\$|INKEY\\\$|INSTR\\(" ."|LEFT\\\$\\(|LIST|LOAD|MID\\\$\\(|NEW|OLD" ."|POINT\\(|RENUMBER|RND|RIGHT\\\$\\(|SAVE|STR\\\$" ."|STRING\\\$\\(|TAB\\("; $KEYWORDS3="ADC|AND|ASL|BCC|BCS|BEQ|BIT|BMI|BNE|BPL" ."|BRK|BVC|BVS|CLC|CLD|CLI|CLV|CMP|CPX|CPY" ."|DEC|DEX|DEY|EOR|INC|INX|INY|JMP|JSR|LDA" ."|LDX|LDY|LSR|NOP|ORA|PHA|PHP|PLA|PLP|ROL" ."|ROR|RTI|RTS|SBC|SEC|SED|SEI|STA|STX|STY" ."|TAX|TAY|TSX|TXA|TXS|TYA|OPT|EQUB|EQUW" ."|EQUD|EQUS"; ($KEYWORDS4=$KEYWORDS3) =~ tr/A-Z/a-z/; $ONEARG="ABS|ACS|ADVAL|ASC|ASN|ATN|CHAIN|CHR\\\$|COLOUR|COS" ."|DEG|EVAL|EXP|INKEY\\\$?|INT|LEN|LN|LOG|MODE|NOT" ."|OPENIN|OPENOUT|OPENUP|OSCLI|RAD|SGN|SIN|SPC|SQR" ."|STR\\\$~?|TAN|UNTIL|USR|VAL"; $STARCMD="(?:^|:|EL\\.|REP\\.|TH\\.)\\*"; $w ||= 238; die "characters >&80 reserved" if y/\x80-\xFF//; y/\r//d; #convert from DOS or BBC newlines s/^ +//mg; # delete leading spaces s/ +$//mg; # delete trailing spaces s/^\d+ *//mg if ($l || $h); # delete line numbers # protect spaces in strings while(s/^((?:[^"\n]*"[^"\n]*")*[^"\n]*")([^"\n\x80-\xFF]+)/ ($i=$2)=~y|\x00-\x7F|\x80-\xFF|,$1.$i/egmx) {} s/(^\d*|:) *REM (?! )/$1REM/g; # delete first space unless ascii art while(s/((?:^\d*|:)REM.*) /$1\xA0/g) {} # protect space in comments # delete leading spaces in assembly comments unless ascii art s/\\ (?! )/\\/g; while(s/(\\.*) /$1\xA0/g) {} # protect spaces in assembly comments while(s/($STARCMD.*) /$1\xA0/g) {} # and in *commands s/ +/ /g; # collapse strings of spaces s/" "/"\xA0"/g; # prevent merging of strings # prevent merging/non-parsing of keywords (END covers END PROC case) s/(COU\.|COUNT|END|E\.|ENDPROC|ERL|ERR|FA\.|FALSE|H\.|HIMEM|LOM\. |LOMEM|PA\.|PAGE|PI|POS|TI\.|TIME|TOP|TRU\.|TRUE|VP\.|VPOS) \ (?=[A-Za-z0-9_`])/$1\xA0/gx; s/ERR OR/ERR\xA0OR/g; s/GET \$/GET\xA0\$/g; s/INKEY \$/INKEY\xA0\$/g; s/MOD E/MOD\xA0E/g; s/TO P/TO\xA0P/g; # strip leading zeros from hex constants, make decimal if <10 s/\&0*([0-9A-F]+)/(hex($1)<10)?hex($1):"&".$1/eg; # if crunching hard and decimal is shorter, use it s/\&0*([0-9A-F]+)/(length(hex($1))\[\\\]^{|}~]) /$1/g; # before certain other characters s/ ([!"#&'()*+,\-\/:;<=>?@\[\\\]^{|}~])/$1/g; # after keywords # keywords ending in symbols already dealt with # commands aren't worth listing here, and RND is special s/($KEYWORDS)\ /$1/gx; # after assembly mnemonics within square brackets # but not before FN calls while(s/(\[[^]]*(?:^|:|(?<=\[))(?:$KEYWORDS3|$KEYWORDS4)) \ (?!$KEYWORDS|$KEYWORDS2)/$1/gmx) {} # delete unnecessary THENs s/(?:THEN|TH\.)(?!=|\*|\d|H\.|HIMEM|LOM\.|LOMEM|PA\.|PAGE|PT\.|PTR |TI\.|TIME)//gx; # delete parentheses around simple arguments to unary fns/cmds while(s/($ONEARG)\(([+-?!]*(?:[0-9.]+(?:E[+-]?\d+)?|&[0-9A-F]+ |"(?:""|[^"])*"|[A-Za-z_`][A-Za-z0-9_`]*[%\$]?))\) /$1$2/gx){} if($h) { s/(?:^|:)REM.*//mg; # delete comments s/\\.*//g; # delete assembly comments s/\:$//mg; # delete trailing colons # abbreviate keywords, only when it save space s/ADVAL/AD./g;s/AND/A./g;s/BGET/B./g; s/CALL/CA./g;s/CHAIN/CH./g;s/CLEAR/CL./g; s/CLOSE/CLO./g;s/COLOUR/C./g;s/COUNT/COU./g; s/DATA/D./g;s/DRAW/DR./g;s/ELSE/EL./g; s/ENDPROC/E./g;s/ENVELOPE/ENV./g;s/ERROR/ERR./g; s/EVAL/EV./g;s/FALSE/FA./g;s/FOR/F./g; s/GCOL/GC./g;s/GET\$/GE./g;s/GOSUB/GOS./g; s/GOTO/G./g;s/HIMEM/H./g;s/INKEY\$/INK./g; s/INPUT/I./g;s/INSTR\(/INS./g;s/LEFT\$\(/LE./g; s/LOCAL/LOC./g;s/LOMEM/LOM./g;s/MID\$\(/M./g; s/MODE/MO./g;s/NEXT/N./g;s/OPENIN/OP./g; s/OPENOUT/OPENO./g;s/OSCLI/OSC./g;s/PAGE/PA./g; s/PLOT/PL./g;s/POINT\(/PO./g;s/PRINT/P./g; s/REPEAT/REP./g;s/REPORT/REPO./g;s/RESTORE/RES./g; s/RETURN/R./g;s/RIGHT\$\(/RI./g;s/SOUND/SO./g; s/STEP/S./g;s/STRING\$\(/STRI./g;s/TAN/T./g; s/THEN/TH./g;s/TIME/TI./g;s/TRACE/TR./g; s/UNTIL/U./g;s/VDU/V./g;s/VPOS/VP./g; s/WIDTH/WI./g; # compact variable names # protect *commands s/($STARCMD)(.*$)/($i=$2)=~y|\x00-\x7F|\x80-\xFF|,$1.$i/egm; # protect hex constants s/(\&[0-9A-F]+)/($i=$1)=~y|\x00-\x7F|\x80-\xFF|,$i/eg; # protect keywords while(s/(\[[^]]*(?:^|:|(?<=\[)))($KEYWORDS3|$KEYWORDS4) /($i=$2)=~y|\x00-\x7F|\x80-\xFF|,$1.$i/egmx) {} s/((?length($b) ||reverse($a) cmp reverse($b)} keys %proc) { $foo =~ /^(PROC|FN)/; $prefix = $1; $proc{$foo} = $prefix.reverse($proctop{$prefix}); print STDERR $proc{$foo},"\t",$foo,"\n"; if(++$proctop{$prefix} eq 'AA') { $proctop{$prefix} = 'a'; } } s/(\xD0\xD2\xCF\xC3|\xC6\xCE)([A-Za-z0-9_`]+) /($i=$1)=~y|\x80-\xFF|\x00-\x7F|, ($j=defined($proc{$i.$2})?$proc{$i.$2}:$i.$2) =~y|\x00-\x7F|\x80-\xFF|, $j/egx; # .E, .E0 etc are valid numbers but the E, E0 are picked up # not cheap to determine if . belongs to a partial keyword # e.g. COL.E (var), IF.E (num), MOD.E (var despite MOD). @vars = /((?>[A-Za-z_`](? IFD=0E=0 (No FN) # could resolve by inserting space before E for $special (qw(@% A A% C% E O% P% X X% Y Y%)) { if(exists($var{$special})) { $var{$special} = $special; print STDERR $var{$special},"\t",$special,"\n"; } $target{$special} = undef; } for $suffix ("","(","\$","\$\(","%","%(") { $vartop{$suffix}='A'; } for $foo (sort {length($a)<=>length($b) ||reverse($a) cmp reverse($b)} keys %var) { next if(defined($var{$foo})); $foo =~ /([\$\%]?\(?)$/; $suffix = $1; while(exists($target{reverse($vartop{$suffix}) .$suffix})) { if(++$vartop{$suffix} eq 'AA') { $vartop{$suffix}='a'; } } $var{$foo} = reverse($vartop{$suffix}).$suffix; $target{$var{$foo}} = undef; print STDERR $var{$foo},"\t",$foo,"\n"; } s/((?>[A-Za-z_`](?