HEX

File: //usr/share/doc/tin/tools/w2r.pl
#! /usr/bin/perl -w
#
# reads a tin filter file with wildmat filters on STDIN, converts it to
# regexp filters and returns it on STDOUT
#
# 2020-11-10 <urs@tin.org>
#
# NOTE: don't use w2r.pl on regexp filters
#
# for case optimization of your regexp filters use opt-case.pl, i.e.:
# w2r.pl < wildmat-filter-file | opt-case.pl > regexp-filter-file
#
# for joining regexp filters with the same group= and score= use
# joinf.pl (not written yet)

# perl 5 is needed for lookahead assertions and perl < 5.004 is known
# to be buggy
require 5.004;

# version Number
# $VERSION = "0.2.8";

# Filter loop: copy the filter file from STDIN (or the files named on the
# command line) to STDOUT one rule line at a time, rewriting the pattern of
# every wildmat rule into a regexp.
while (defined(my $rule = <>)) {
	chomp $rule;

	# comments, indented lines and blank lines are copied verbatim
	my $verbatim = ($rule =~ m/^(?:[#\s]|$)/);

	if (!$verbatim) {
		# a rule with nothing after the '=' is nonsense, drop it
		next if ($rule =~ m/^[^=]+=$/);

		# only these rule types carry a pattern that must be translated;
		# the captures are copied before w2p() runs any match of its own
		if ($rule =~ m/^(subj|from|msgid(?:|_last|_only)|refs_only|xref|path)=(.*)$/) {
			my ($field, $wildmat) = ($1, $2);
			$rule = $field . "=" . w2p($wildmat);
		}
	}

	# everything else (and the rewritten rule) goes out unchanged
	print "$rule\n";
}


# Turn one wildmat pattern into a regexp.
#
# Argument: the wildmat pattern (the text after "subj=", "from=", ...).
# Returns:  the regexp as a string.
#
# Translation rules:
#   - a char following an unquoted \ is copied as is (the \ is kept)
#   - everything between an unquoted [ and the next ] is copied as is;
#     a [ inside a class is an ordinary member and a ] without an open
#     class is an ordinary char, so neither can derail the translation
#     of the rest of the pattern
#   - outside [] "*" becomes ".*", "?" becomes ".", a tab becomes "\t"
#     and the regexp metas . ( ) ^ $ + | { } are quoted
#   - the first (unquoted) class is abbreviated if it is exactly one of
#     [0-9], [a-zA-Z0-9_], [a-zA-Z0-9] or [a-zA-Z] ([A-Za-z...] is
#     accepted as well); single-case classes are left alone
#   - a wildmat always covers the whole string, so the result is anchored
#     with ^ and $ unless it starts/ends with ".*"; a leading/trailing
#     ".*" is then dropped as it is redundant in an unanchored regexp
sub w2p {
	my ($wild) = @_;	# input pattern

	# wildmat char => regexp text for unquoted chars outside []
	my %xlate = (
		"\t" => '\t',	# translate tabs
		'*'  => '.*',	# translate *
		'?'  => '.',	# translate ?
		'.'  => '\.',	# quote regexp metas which are
		'('  => '\(',	# plain chars in a wildmat
		')'  => '\)',
		'^'  => '\^',
		'$'  => '\$',
		'+'  => '\+',
		'|'  => '\|',
		'{'  => '\{',
		'}'  => '\}',
	);

	my $rval = "";		# output line
	my $bmode = 0;		# inside [] ?
	my $quoted = 0;		# last char was an unquoted \ ?

	# break line into chars
	foreach my $cchar (split(//, $wild)) {
		if ($quoted) {
			# char after a quote can't be a meta
			$rval .= $cchar;
			$quoted = 0;
		} elsif ($bmode) {
			# inside [] only the closing ] is special; a \ is copied
			# but doesn't quote the next char
			$bmode = 0 if ($cchar eq ']');
			$rval .= $cchar;
		} elsif ($cchar eq '[') {
			$bmode = 1;
			$rval .= $cchar;
		} else {
			# usual cases
			$quoted = 1 if ($cchar eq '\\');
			$rval .= exists($xlate{$cchar}) ? $xlate{$cchar} : $cchar;
		}
	}

	# common abbreviations
	# TODO: make this global
	# replace [0-9] with [\d] in the first []
	# replace [a-zA-Z0-9_] with [\w] in the first []
	# replace [a-zA-Z0-9] with [^\W_] in the first []
	# replace [a-zA-Z] with [^\W\d_] in the first []
	#
	# the prefix (?:[^\\\[]|\\.)* steps over quoted chars as a pair, so
	# a quoted \[ is never mistaken for the start of a class
	$rval =~ s/^((?:[^\\\[]|\\.)*)\[0-9\]/$1\[\\d\]/;
	$rval =~ s/^((?:[^\\\[]|\\.)*)\[(?:a-zA-Z|A-Za-z)0-9_\]/$1\[\\w\]/;
	$rval =~ s/^((?:[^\\\[]|\\.)*)\[(?:a-zA-Z|A-Za-z)0-9\]/$1\[^\\W_\]/;
	$rval =~ s/^((?:[^\\\[]|\\.)*)\[(?:a-zA-Z|A-Za-z)\]/$1\[^\\W\\d_\]/;

	# optimizations
	#
	# add ^-anchor if needed
	$rval = '^' . $rval unless ($rval =~ m/^\.\*/);
	# add $-anchor if needed; a trailing ".*" is always a translated "*"
	# as a quoted star ends in "\*"
	$rval .= '$' unless ($rval =~ m/\.\*$/);
	# remove leading .* if allowed
	$rval =~ s/^\.\*(?!$)//;
	# remove trailing .* if allowed
	$rval =~ s/(.+)\.\*$/$1/;

	return $rval;
}

__END__

=head1 NAME

w2r.pl - Convert tin wildmat filters to tin regexp filters

=head1 SYNOPSIS

B<w2r.pl> E<lt> I<input> [E<gt> I<output>]

=head1 DESCRIPTION

B<w2r.pl> reads a L<tin(1)> filter file with wildmat filters on STDIN,
converts it to regexp filters and returns it on STDOUT.

=head1 NOTES

Don't use B<w2r.pl> on regexp filter files.

=head1 AUTHOR

Urs Janssen E<lt>urs@tin.orgE<gt>

=head1 SEE ALSO

L<tin(1)>, L<tin(5)>

=cut