I built this concordancer two years ago when I was crazy about Perl. This is the source code.
use warnings;
use strict;
use IO::File;
use File::Find;
use Getopt::Std;
use vars qw(
$usage
$SearchPattern
$filnm
$context
%kwicc
@kwic
$opt_l
$opt_r
$opt_c
$opt_m
$opt_t
$i);
$usage=<<EOF;
usage: skwic [ -lrct ][-m NUM] <pattern> <path>
-l: Sort by left context
-r: Sort by right context
-c: Sort by pattern
-m: Line limitation
-t: Display collocations
(In this version if you use option -t, search pattern
is restricted to a single word.)
EOF
die $usage unless scalar(@ARGV)>1;
$SearchPattern = shift;
CheckReMsg($SearchPattern);
$filnm=shift;
getopts( "lrctm:" );
if ($opt_t && $SearchPattern=~ /[^\w\d'-\\]/){die "In this version if you use option -t, search pattern is restricted to a single word.";}
if(($opt_l && $opt_r)||($opt_l && $opt_c)||($opt_c && $opt_r))
{die "argument error.";}
$context=40;
$i=0;
sub CheckReMsg{
my $pattern = shift;
die "Illegal pattern: $@ \n" unless my $re = compile_re($pattern);
}
sub compile_re{
my $pattern = shift;
my $re;
eval {$re = qr/$pattern/;};
return $re;
}
sub parsewL {
if ($opt_m){
if ($i==$opt_m){goto L;}}
my $rawstr=shift;
my @wordsb=();
while ($rawstr=~/\b[\w\d'-]+\b/gi){
push @wordsb, $&."_$i";
}
$i++;
return $wordsb[$#wordsb];
}
sub parsewR {
if ($opt_m){
if ($i==$opt_m){goto L;}}
my $rawstr=shift;
my @wordsb=();
while ($rawstr=~/\b[\w\d'-]+\b/gi){
push @wordsb, $&."_$i";}
$i++;
return $wordsb[0];
}
sub domatch
{
my $myfh=shift;
my $name=shift;
warn "$name is a directory\n" and return if -d $name;
local $/=undef;
while(<$myfh>)
{
$_ =~ s/\n/ /g;
while($_=~/$SearchPattern/g){
my $prepttn="";
my $postpttn="";
if(length($`)<=$context){$prepttn=$`;}
elsif(length($`)>$context){$prepttn=substr($`,-$context);}
if (length($')<=$context){$postpttn=$';}
elsif(length($')>$context){$postpttn=substr($',0,$context);}
no warnings;
if ($opt_l){$kwicc{parsewL($prepttn)}=" "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n";
}
elsif ($opt_r){$kwicc{parsewR($postpttn)}=" "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n";
}
elsif ($opt_c) {$kwicc{$&."_$i"}=" "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n";
$i++;
if ($i==$opt_m){goto L;}
}
else {print " "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n" ;
push @kwic," "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n";
$i++;
if ($i==$opt_m){exit;}
}
}
}
close $myfh;
}
if ( -d $filnm )
{
find(
sub {
return unless -T;
my $fh = IO::File->new( $_ );
domatch $fh, $File::Find::name;
},
$filnm
);
}
else
{
use warnings;
open(my $fh, $filnm) or die "Unable to open: $! \n";
domatch $fh, $filnm;
}
L:
if ($opt_l || $opt_r || $opt_c){
foreach(sort keys %kwicc){
push @kwic, $kwicc{$_};
print $kwicc{$_};
}
}
if ($opt_t){collocate($SearchPattern, @kwic);}
sub collocate {
no warnings;
my ($keyword, @kwicfile)=@_;
my @token=();
my %count=();
my @gset;
foreach (@kwicfile){
$_ =~ s/\s+/ /g;
while (/(\b[\w\d'-]+\b)\s(\b[\w\d'-]+\b\s){0}\b$keyword\b/gi){
my $buff=$1;
$buff=~tr/A-Z/a-z/;
$count{"$buff..."}++;}
while (/\b$keyword\b(\s\b[\w\d'-]+\b){0}\s(\b[\w\d'-]+\b)/gi){
my $buff=$2;
$buff=~tr/A-Z/a-z/;
$count{"...$buff"}++;}
}
print"\n--------------------------\n";
while ((my $key, my $value) = each %count) {
push @gset,$value."_"."$key\n";
};
@gset=sort {int($a)<=>int($b)or $a cmp $b} @gset;
print reverse @gset;
}
http://www.corpus4u.org/upload/forum/2005080405494198.rar
use warnings;
use strict;
use IO::File;
use File::Find;
use Getopt::Std;
use vars qw(
$usage
$SearchPattern
$filnm
$context
%kwicc
@kwic
$opt_l
$opt_r
$opt_c
$opt_m
$opt_t
$i);
$usage=<<EOF;
usage: skwic [ -lrct ][-m NUM] <pattern> <path>
-l: Sort by left context
-r: Sort by right context
-c: Sort by pattern
-m: Line limitation
-t: Display collocations
(In this version if you use option -t, search pattern
is restricted to a single word.)
EOF
die $usage unless scalar(@ARGV)>1;
$SearchPattern = shift;
CheckReMsg($SearchPattern);
$filnm=shift;
getopts( "lrctm:" );
if ($opt_t && $SearchPattern=~ /[^\w\d'-\\]/){die "In this version if you use option -t, search pattern is restricted to a single word.";}
if(($opt_l && $opt_r)||($opt_l && $opt_c)||($opt_c && $opt_r))
{die "argument error.";}
$context=40;
$i=0;
sub CheckReMsg{
my $pattern = shift;
die "Illegal pattern: $@ \n" unless my $re = compile_re($pattern);
}
sub compile_re{
my $pattern = shift;
my $re;
eval {$re = qr/$pattern/;};
return $re;
}
sub parsewL {
if ($opt_m){
if ($i==$opt_m){goto L;}}
my $rawstr=shift;
my @wordsb=();
while ($rawstr=~/\b[\w\d'-]+\b/gi){
push @wordsb, $&."_$i";
}
$i++;
return $wordsb[$#wordsb];
}
sub parsewR {
if ($opt_m){
if ($i==$opt_m){goto L;}}
my $rawstr=shift;
my @wordsb=();
while ($rawstr=~/\b[\w\d'-]+\b/gi){
push @wordsb, $&."_$i";}
$i++;
return $wordsb[0];
}
sub domatch
{
my $myfh=shift;
my $name=shift;
warn "$name is a directory\n" and return if -d $name;
local $/=undef;
while(<$myfh>)
{
$_ =~ s/\n/ /g;
while($_=~/$SearchPattern/g){
my $prepttn="";
my $postpttn="";
if(length($`)<=$context){$prepttn=$`;}
elsif(length($`)>$context){$prepttn=substr($`,-$context);}
if (length($')<=$context){$postpttn=$';}
elsif(length($')>$context){$postpttn=substr($',0,$context);}
no warnings;
if ($opt_l){$kwicc{parsewL($prepttn)}=" "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n";
}
elsif ($opt_r){$kwicc{parsewR($postpttn)}=" "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n";
}
elsif ($opt_c) {$kwicc{$&."_$i"}=" "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n";
$i++;
if ($i==$opt_m){goto L;}
}
else {print " "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n" ;
push @kwic," "x($context-length($prepttn)).$prepttn.$&.$postpttn."\n";
$i++;
if ($i==$opt_m){exit;}
}
}
}
close $myfh;
}
if ( -d $filnm )
{
find(
sub {
return unless -T;
my $fh = IO::File->new( $_ );
domatch $fh, $File::Find::name;
},
$filnm
);
}
else
{
use warnings;
open(my $fh, $filnm) or die "Unable to open: $! \n";
domatch $fh, $filnm;
}
L:
if ($opt_l || $opt_r || $opt_c){
foreach(sort keys %kwicc){
push @kwic, $kwicc{$_};
print $kwicc{$_};
}
}
if ($opt_t){collocate($SearchPattern, @kwic);}
sub collocate {
no warnings;
my ($keyword, @kwicfile)=@_;
my @token=();
my %count=();
my @gset;
foreach (@kwicfile){
$_ =~ s/\s+/ /g;
while (/(\b[\w\d'-]+\b)\s(\b[\w\d'-]+\b\s){0}\b$keyword\b/gi){
my $buff=$1;
$buff=~tr/A-Z/a-z/;
$count{"$buff..."}++;}
while (/\b$keyword\b(\s\b[\w\d'-]+\b){0}\s(\b[\w\d'-]+\b)/gi){
my $buff=$2;
$buff=~tr/A-Z/a-z/;
$count{"...$buff"}++;}
}
print"\n--------------------------\n";
while ((my $key, my $value) = each %count) {
push @gset,$value."_"."$key\n";
};
@gset=sort {int($a)<=>int($b)or $a cmp $b} @gset;
print reverse @gset;
}
http://www.corpus4u.org/upload/forum/2005080405494198.rar