kzhr's diary

ad ponendum

国語研天草版テキストから対比できそうな四つ仮名を取り出すスクリプト

#!perl

use strict;
use warnings;
use utf8;
use open qw/:std :utf8/;

my $substr = 3; # まとめ文字数

my $file = shift @ARGV;

open my $fh, "<", $file or die;

my %match;
my $fno = '';
my $lno = 1;
my $llno = 0;
my $skip = 0;

while (my $line = <$fh>) {
  chomp $line;
  $llno++;
  if (not $skip) {
    $skip = 1 if $line eq '(扉)';
    $fno = '扉' if $skip;
    next;
  }
  if ($line =~ /^((.+))$/) {
    $fno = $1;
    $lno = 1;
    next;
  }
  next if $line !~ /^\d/;
  while ($line =~ /\b\w*zzu\w*\b/ig) {
    my $m = my $m_mod = $&;
    $m_mod = lc $m_mod;
    $m_mod =~ s/zzu/zu/ig;
    $m_mod =~ s/gi/ji/ig;
    my $m3 = substr $m_mod, 0, $substr;
    push @{$match{$m3}{zzu}}, [$m, $fno, $lno, $llno];
  }
  while ($line =~ /\b\w*(?<!z)zu\w*\b/ig) {
    my $m = my $m_mod = $&;
    $m_mod = lc $m_mod;
    $m_mod =~ s/zzu/zu/ig;
    $m_mod =~ s/gi/ji/ig;
    my $m3 = substr $m_mod, 0, $substr;
    push @{$match{$m3}{zu}}, [$m, $fno, $lno, $llno];
  }
  while ($line =~ /\b\w*ji\w*\b/ig) {
    my $m = my $m_mod = $&;
    $m_mod = lc $m_mod;
    $m_mod =~ s/zzu/zu/ig;
    $m_mod =~ s/gi/ji/ig;
    my $m3 = substr $m_mod, 0, $substr;
    push @{$match{$m3}{ji}}, [$m, $fno, $lno, $llno];
  }
  while ($line =~ /\b\w*gi\w*\b/ig) {
    my $m = my $m_mod = $&;
    $m_mod = lc $m_mod;
    $m_mod =~ s/zzu/zu/ig;
    $m_mod =~ s/gi/ji/ig;
    my $m3 = substr $m_mod, 0, $substr;
    push @{$match{$m3}{gi}}, [$m, $fno, $lno, $llno];
  }
  $lno++;
}

my @keys = sort keys %match;
for my $key (@keys) {
  if (exists $match{$key}{ji} && exists $match{$key}{gi} or exists $match{$key}{zu} && exists $match{$key}{zzu}) {
    my $m = $match{$key};
    my @yotsu_keys = sort %{$m};
    for my $yotsu_key (@yotsu_keys) {
      for my $y (@{$m->{$yotsu_key}}) {
        printf "%s\t%s\t%s\t%s\t%s\t%s\n", $key, $yotsu_key, @$y;
      }
    }
  }
}

ばあいによって、テキストデータのBOMを落としたり改行コードを直したりする必要はある。対比ではなく全例を見たいときは、if (exists…)のif文をコメントアウトすればよい。