kzhr's diary

ad ponendum

A script that extracts potentially contrastable yotsugana (じ/ぢ/ず/づ) examples from the NINJAL Amakusa-ban texts

#!/usr/bin/perl

use strict;
use warnings;
use utf8;
use open qw/:std :utf8/;

my $substr = 3; # number of leading characters used as the grouping key

my $file = shift @ARGV;

open my $fh, "<", $file or die "$file: $!";

my %match;    # $match{<key>}{<type>} = [[match, folio, line no, file line no], ...]
my $fno = ''; # folio/page label
my $lno = 1;  # line number within the current folio
my $llno = 0; # physical line number in the file
my $skip = 0; # set once the title-page marker '(扉)' has been seen

while (my $line = <$fh>) {
  chomp $line;
  $llno++;
  if (not $skip) {
    $skip = 1 if $line eq '(扉)';
    $fno = '扉' if $skip;
    next;
  }
  if ($line =~ /^\((.+)\)$/) {   # a parenthesized folio label on its own line
    $fno = $1;
    $lno = 1;
    next;
  }
  next if $line !~ /^\d/;
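  # For every match below, build a grouping key: lowercase the form,
  # normalize the spellings (zzu -> zu, gi -> ji), and keep the first
  # $substr characters, so that forms differing only in their yotsugana
  # spelling fall under the same key.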
  while ($line =~ /\b\w*zzu\w*\b/ig) {
    my $m = my $m_mod = $&;
    $m_mod = lc $m_mod;
    $m_mod =~ s/zzu/zu/ig;
    $m_mod =~ s/gi/ji/ig;
    my $m3 = substr $m_mod, 0, $substr;
    push @{$match{$m3}{zzu}}, [$m, $fno, $lno, $llno];
  }
  while ($line =~ /\b\w*(?<!z)zu\w*\b/ig) {
    my $m = my $m_mod = $&;
    $m_mod = lc $m_mod;
    $m_mod =~ s/zzu/zu/ig;
    $m_mod =~ s/gi/ji/ig;
    my $m3 = substr $m_mod, 0, $substr;
    push @{$match{$m3}{zu}}, [$m, $fno, $lno, $llno];
  }
  while ($line =~ /\b\w*ji\w*\b/ig) {
    my $m = my $m_mod = $&;
    $m_mod = lc $m_mod;
    $m_mod =~ s/zzu/zu/ig;
    $m_mod =~ s/gi/ji/ig;
    my $m3 = substr $m_mod, 0, $substr;
    push @{$match{$m3}{ji}}, [$m, $fno, $lno, $llno];
  }
  while ($line =~ /\b\w*gi\w*\b/ig) {
    my $m = my $m_mod = $&;
    $m_mod = lc $m_mod;
    $m_mod =~ s/zzu/zu/ig;
    $m_mod =~ s/gi/ji/ig;
    my $m3 = substr $m_mod, 0, $substr;
    push @{$match{$m3}{gi}}, [$m, $fno, $lno, $llno];
  }
  $lno++;
}

my @keys = sort keys %match;
for my $key (@keys) {
  if ((exists $match{$key}{ji} && exists $match{$key}{gi}) or (exists $match{$key}{zu} && exists $match{$key}{zzu})) {
    my $m = $match{$key};
    my @yotsu_keys = sort keys %{$m};
    for my $yotsu_key (@yotsu_keys) {
      for my $y (@{$m->{$yotsu_key}}) {
        printf "%s\t%s\t%s\t%s\t%s\t%s\n", $key, $yotsu_key, @$y;
      }
    }
  }
}

Depending on the file, you may need to strip the BOM from the text data or fix the line endings first. If you want to see all the examples rather than only the contrasting pairs, comment out the if (exists …) statement.
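
A minimal sketch of that cleanup, assuming UTF-8 input (this filter is not part of the script above; run it as perl clean.pl infile > outfile):

#!/usr/bin/perl

use strict;
use warnings;
use open qw/:std :utf8/;

# Slurp the file, drop a leading BOM if present, and normalize
# CRLF/CR line endings to LF.
local $/;
my $text = <>;
$text =~ s/\A\x{FEFF}//;   # strip the BOM
$text =~ s/\r\n?/\n/g;     # CRLF or bare CR -> LF
print $text;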

A Perl script that cuts text containing IDSs (Ideographic Description Sequences) into single characters, treating a whole IDS as one character

use strict;
use warnings;
use utf8;

sub ids_split {
	my $text = shift;
	my @chars = split //, $text;
	my @split;
	my $buffer_text = '';
	my $buffer_num = 0;
	for my $char (@chars) {
		if ($char =~ /[⿰⿱⿴⿵⿶⿷⿸⿹⿺⿻]/) {   # binary IDCs (U+2FF0, U+2FF1, U+2FF4..U+2FFB)
			$buffer_num = $buffer_num ? $buffer_num + 2 : $buffer_num + 3;
		}
		elsif ($char =~ /[⿲⿳]/) {              # ternary IDCs (U+2FF2, U+2FF3)
			$buffer_num = $buffer_num ? $buffer_num + 3 : $buffer_num + 4;
		}

		if ($buffer_num) {
			$buffer_text .= $char;
			$buffer_num -= 1;
			next if $buffer_num;
			push @split, $buffer_text;
			$buffer_text = '';
		}
		else {
			push @split, $char;
		}
	}
	return @split;
}

1;
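
A quick usage sketch (hypothetical input): after loading the file above with require, a whole IDS comes back as a single list element.

my @units = ids_split('大⿱艹⿰木目');
print join('|', @units), "\n";   # prints 大|⿱艹⿰木目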

Notes on the copyright status of the stenographic minutes of the 臨時仮名遣調査委員会 (Provisional Committee on Kana Orthography, 1909)

See, for example, the FAQ of the Diet Proceedings Search System (国会会議録検索システム). The minutes themselves are in the NDL Digital Collections (臨時仮名遣調査委員会議事速記録 - 国立国会図書館デジタルコレクション), but whether the copyright should really be credited solely to the Library Section of the Minister of Education's Secretariat (文部大臣官房図書課) is not clear to me (it may have been checked person by person, but the database does not say so), so I looked into it. If everyone involved had died by 1967, that settles the matter for the time being.

The people listed in the index are the following.

  1. 牧野伸顕 (Minister of Education) †1949
  2. 菊池大麓 (chair) †1917
  3. 曽我祐準 (member) †1935
  4. 松平正直 (member) †1915
  5. 浅田徳則 (member) †1933
  6. 岡部長職 (member) †1925
  7. 矢野文雄 (member) †1931
  8. 森林太郎 (member) †1922
  9. 岡野敬次郎 (member) †1925
  10. 伊知地彦次郎 (member)*1 †1912
  11. 伊沢修二 (member) †1917
  12. 芳賀矢一 (member) †1927
  13. 藤岡好古 (member) †1917
  14. 大槻文彦 (member) †1928
  15. 江原素六 (member) †1922
  16. 三宅雄二(次)郎 (member) †1945
  17. 渡部董之介 (non-member*2) †1938

Not in the index but appearing as authors are the following.

  1. 井上毅 (appendix) †1895
  2. Mr. Dodge of the U.S. Embassy (appendix), year of death unknown

In addition, the following were committee members who made no recorded remarks.

  1. 小牧昌業
  2. 山川健次郎
  3. 小松謙二郎
  4. 井上哲次郎
  5. 上田萬年
  6. 徳富猪一郎
  7. 横井時雄
  8. 松村茂助
  9. 島田三郎
  10. 鎌田栄吉
  11. 板根友敬
  12. 土舘長言
  13. 肥塚籠

*1: Sic; properly 伊地知彦次郎.

*2: Secretariat. The roster records him as appointed chief clerk (主事) of the committee. One could, however, regard his contribution as a work made in the course of his duties and leave his year of death out of consideration.

A script that computes statistics on how much the results of "the Python script that roughly classifies the Dataset of PMJT Character Shapes" were corrected (by hand)

#!/usr/bin/perl

use 5.012;

use strict;
use warnings;
use utf8;

use Cwd;
use FindBin;

use Data::Dumper;

require "$FindBin::Bin/clgr.pl";

my $wd = Cwd::getcwd();
my @directories = clgr::scan_dir($wd);

say join ",", 'Mat', 'U+ID', 'Total', 'Err', 'Err%', 'Orig No C', 'Mod No C', 'Diff';

for my $dir (@directories) {
  my @path = clgr::gen_path($dir);
  my $csv = clgr::read_csv($dir);
  my $data = clgr::cluster($csv);

  my %stat = ();   # $stat{<mod cluster>}{<orig cluster>} = item count
  my $i = 0;
  my %c_no = ();   # distinct cluster labels, original (org) and corrected (mod)

  for my $c (@$data) {
    $c_no{mod}{$i}++;
    for my $item (@$c){
      $stat{$i}{$item->[1]}++;
      $c_no{org}{$item->[1]}++;
    }
    $i++;
  }
  my @stat_keys = sort keys %stat;
  my $error_no = 0;
  my $total = 0;

  for my $stat_key (@stat_keys) {
    my $cluster_max = 0;
    my $cluster_total = 0;
    my @original_stat_keys = sort keys %{$stat{$stat_key}};
    for my $original_stat_key (@original_stat_keys) {
      $cluster_total += $stat{$stat_key}{$original_stat_key};
      $cluster_max = $stat{$stat_key}{$original_stat_key} > $cluster_max ? $stat{$stat_key}{$original_stat_key} : $cluster_max;
    }
    $total += $cluster_total;
    $error_no += $cluster_total - $cluster_max;
  }

  my $orig_no_c = scalar keys %{$c_no{org}};
  my $mod_no_c = scalar keys %{$c_no{mod}};
  # 'Mat', 'U+ID', 'Total', 'Err', 'Err%', 'Orig No C', 'Mod No C', 'Diff b/w Orig & Mod No C'
  say join ",", $path[-3], $path[-1], $total, $error_no, sprintf("%.2f", $error_no / $total * 100), $orig_no_c, $mod_no_c, abs($orig_no_c - $mod_no_c);
#  warn Dumper($data, \%stat);
#  last;
}
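
Err here is computed per hand-corrected cluster: count the items whose original (automatically assigned) cluster label differs from that cluster's majority label, then sum over all clusters. A corrected cluster assembled from five items of one automatic cluster and two of another thus adds 7 to Total and 2 to Err, i.e. at least two of its items had to be moved by hand.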

A script that re-exports the corrected results of "the Python script that roughly classifies the Dataset of PMJT Character Shapes" to HTML

#!/usr/bin/perl

use 5.012;

use strict;
use warnings;
use utf8;

use Cwd;
use FindBin;

use Data::Dumper;

require "$FindBin::Bin/clgr.pl";

my $now = time();

my $wd = Cwd::getcwd();
my @directories = clgr::scan_dir($wd);

for my $dir (@directories) {
    my @path = clgr::gen_path($dir);
    my $html_file = "$path[-3]/$path[-3]_mod.html";
    my $csv_file = "$path[-3]/$path[-3].csv";
    # (re)write the headers only if the file is missing or predates this run
    if (not -e $html_file or (stat $html_file)[9] < $now) {
        open my $html_fh, ">", $html_file or die $html_file;
        print $html_fh clgr::regen_html_header();
    }
    if (not -e $csv_file or (stat $csv_file)[9] < $now) {
        open my $csv_fh, ">", $csv_file or die $csv_file;
        print $csv_fh clgr::regen_csv_header();
    }

    my $data = clgr::read_csv($dir);
    my $cluster = clgr::cluster($data);
    my $html = clgr::regen_html($path[-1], "$wd/$path[-3]", $cluster);
    my $csv = clgr::regen_csv($path[-1], $path[-3], $cluster);
    open my $html_fh, ">>", $html_file or die $html_file;
    print $html_fh $html;
    open my $csv_fh, ">>", $csv_file or die $csv_file;
    print $csv_fh $csv;
}

clgr.pl

#!/usr/bin/perl

package clgr;

use strict;
use warnings;
use utf8;
use File::Spec;
use Image::Size qw/html_imgsize/;

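# Leaf (per-character) directory names to collect; compared with the kana
# list in the Python script below, this one omits the iteration marks
# (U+309D, U+309E) and the katakana entries (U+30B5, U+30C4, U+30CB, U+30F6).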
my @kana = (
    'U+3042', 'U+3044', 'U+3046', 'U+3048', 'U+304A', 'U+304B',
    'U+304C', 'U+304D', 'U+304E', 'U+304F', 'U+3050', 'U+3051',
    'U+3052', 'U+3053', 'U+3054', 'U+3055', 'U+3056', 'U+3057',
    'U+3058', 'U+3059', 'U+305A', 'U+305B', 'U+305C', 'U+305D',
    'U+305E', 'U+305F', 'U+3060', 'U+3061', 'U+3062', 'U+3064',
    'U+3065', 'U+3066', 'U+3067', 'U+3068', 'U+3069', 'U+306A',
    'U+306B', 'U+306C', 'U+306D', 'U+306E', 'U+306F', 'U+3070',
    'U+3072', 'U+3073', 'U+3075', 'U+3076', 'U+3078', 'U+3079',
    'U+307B', 'U+307C', 'U+307E', 'U+307F', 'U+3080', 'U+3081',
    'U+3082', 'U+3084', 'U+3086', 'U+3088', 'U+3089', 'U+308A',
    'U+308B', 'U+308C', 'U+308D', 'U+308F', 'U+3090', 'U+3091',
    'U+3092', 'U+3093'
);

sub scan_dir {
    my $dir = shift;
    $dir .= '/' if $dir !~ m!/\z!ms;
    my @dirs = ();
    opendir(my $dh, $dir) or die $dir;
    while (my $rdir = readdir($dh)) {
        next if $rdir =~ /\A\.+\z/ms;
        if (-d $dir . $rdir) {
            if (grep {$_ eq $rdir} @kana) {
                push @dirs, $dir . $rdir;
            }
            else {
                my @subdir = scan_dir($dir . $rdir);
                push @dirs, @subdir;
            }
        }
    }
    return @dirs;
}

sub read_csv {
    my $dir = shift;
    my @csv = ();
    $dir =~ m!/(U[^/]+)\z!ms or die $dir;
    my $file = $1 . ".csv";
    open my $fh, "<", "$dir/$file" or die "$dir/$file";
    my $head = 0;
    while(my $line = <$fh>) {
        next if not $head++;
        chomp $line;
        $line =~ tr/"//d;
        warn $& if $line =~ /[^\.,\/\+\_A-Za-z0-9]/;   # sanity check: flag unexpected characters
        my @split = split ',', $line;
        if (defined $split[2] and $split[2] eq '' and defined $split[0]) {
            $split[2] = $split[0];   # empty "mod cluster" column falls back to the original cluster
        }
        push @csv, \@split;
    }
    return \@csv;
}

sub cluster {
    my $csv = shift;
    my @cluster = ();
    for my $array (@$csv) {
        push @{$cluster[$array->[2]]}, [$array->[1], $array->[0], $array->[3]]; # $cluster{<mod cl>} = [<seq>, <org cl>, <path>]
    }
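    # order items within each cluster by sequence number, drop empty
    # slots, and sort the clusters from largest to smallest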
    @cluster = sort { scalar(@$b) <=> scalar(@$a) } map { [ sort { $a->[0] <=> $b->[0] } @$_ ] } grep { $_ } @cluster;
    return \@cluster;
}

sub regen_html_header {
    return <<EOF;
<html><head><style>span.nobr{white-space:nowrap;}</style></head><body>
EOF
}

sub regen_html {
    my $uni = shift;
    my $html_dir = shift;
    my $cluster = shift;
    my $html = "<h1>$uni</h1>\n";
    my $i = 1;
    for my $c (@$cluster) {
        $html .= "<h2>Cluster $i: " . scalar(@$c) . " items</h2>\n";
        $html .= "<p>";
        for my $item (@$c) {
            my $img_loc = File::Spec->rel2abs($item->[2], $html_dir);
            my $size = html_imgsize($img_loc);
            $html .= qq(<span class="nobr"><img src="$item->[2]" $size>$item->[0]</span> );
        }
        $i++;
        $html .= "</p>\n";
    }
    return $html;
}

sub regen_csv_header {
    return join(',', 'mat', 'u', 'cluster', 'counts') . "\n";
}

sub regen_csv {
    my $uni = shift;
    my $mat = shift;
    my $cluster = shift;
    my $csv = '';
    my $i = 1;
    for my $c (@$cluster) {
        $csv .= join ',', $mat, $uni, $i, scalar @$c;
        $csv .= "\n";
        $i++;
    }
    return $csv;
}

sub gen_path {
    my $basedir = shift;
    return File::Spec->splitdir($basedir);
}

1;

A Python script that roughly classifies the Dataset of PMJT Character Shapes

#!/usr/bin/python3
# coding: utf-8

#
# Usage: run in the directory just above the downloaded Dataset of PMJT
# Character Shapes (http://codh.rois.ac.jp/char-shape/)
#

from pathlib import Path
from time import time

from PIL import Image
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

ext = '.jpg'

kana = [
    'U+3042', 'U+3044', 'U+3046', 'U+3048', 'U+304A', 'U+304B',
    'U+304C', 'U+304D', 'U+304E', 'U+304F', 'U+3050', 'U+3051',
    'U+3052', 'U+3053', 'U+3054', 'U+3055', 'U+3056', 'U+3057',
    'U+3058', 'U+3059', 'U+305A', 'U+305B', 'U+305C', 'U+305D',
    'U+305E', 'U+305F', 'U+3060', 'U+3061', 'U+3062', 'U+3064',
    'U+3065', 'U+3066', 'U+3067', 'U+3068', 'U+3069', 'U+306A',
    'U+306B', 'U+306C', 'U+306D', 'U+306E', 'U+306F', 'U+3070',
    'U+3072', 'U+3073', 'U+3075', 'U+3076', 'U+3078', 'U+3079',
    'U+307B', 'U+307C', 'U+307E', 'U+307F', 'U+3080', 'U+3081',
    'U+3082', 'U+3084', 'U+3086', 'U+3088', 'U+3089', 'U+308A',
    'U+308B', 'U+308C', 'U+308D', 'U+308F', 'U+3090', 'U+3091',
    'U+3092', 'U+3093', 'U+309D', 'U+309E', 'U+30B5', 'U+30C4',
    'U+30CB', 'U+30F6'
]

basedir = Path.cwd()
now = time()

def detect_dirs(scandir=None)->list:
    '''
    detect the directories to scan, i.e. those
    whose names appear in the kana list
    '''
    if not scandir:
        scandir = Path(basedir)
    dirs = []
    for component in scandir.iterdir():
        if component.name in kana:
            dirs.append(component)
        elif component.is_dir():
            dirs += detect_dirs(component)
    return dirs

def scan_dir(scandir)->list:
    '''
    collect the image files and their sizes,
    descending one level of sub-directories
    '''
    if not scandir.is_dir():
        return []
    files = []

    for f in sorted(scandir.iterdir()):
        if f.is_dir():
            for sf in sorted(f.iterdir()):
                if sf.name.endswith(ext) and sf.is_file():
                    files.append(is_img(sf))
        elif f.name.endswith(ext) and f.is_file():
            files.append(is_img(f))

    return files

def is_img(filename: Path)->list:
    '''
    pair a file with its image size
    '''
    return [filename, img_size(filename)]

def img_size(filename: Path)->list:
    '''
    return the size of the image as [width, height]
    '''
    with Image.open(filename) as img:
        return list(img.size)

def calc_xmeans(files: list)->list:
    '''
    cluster the image sizes with x-means
    '''
    sample = []
    for f in files:
        sample.append(f[1])

    if len(sample) < 15:
        amount = 1
    elif len(sample) > 50:
        amount = 3
    else:
        amount = 2
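
    # 'amount' only sets how many initial centers k-means++ seeds
    # (1-3, scaled to the sample size); x-means may still split
    # clusters further on its own.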

    xm_c = kmeans_plusplus_initializer(sample, amount).initialize()
    xm_i = xmeans(sample, xm_c, ccore=True)
#    xm_i = xmeans(sample, xm_c, ccore=False)    # Use this line on Darwin, and pray

    xm_i.process()
    clusters = xm_i.get_clusters()

    clgr = []

    j = 0
    for c in clusters:
        container = []
        for i in c:
            container.append([j, i, files[i]])
        clgr.append(container)
        j += 1

    return clgr

def export(data: list, directory)->None:
    '''
    export the computed clusters to CSV and HTML
    '''
    export_csv = directory / (directory.parts[-1] + '.csv')
    export_html = Path(directory / '../../').resolve() / (directory.parts[-3] + '.html')
    if not export_html.exists() or export_html.stat().st_mtime < now:
        with export_html.open(mode='w') as exh:
            exh.write('<html><head><style>span.nobr{white-space:nowrap;}</style></head><body>')

    with export_csv.open(mode='w') as exc:
        exc.write(','.join(['cluster no','seq','mod cluster no','file name']) + '\n')
        for c in data:
            for i in c:
                exc.write(','.join([str(i[0]), str(i[1]), '',
                        str(i[2][0].relative_to(export_html.parent))]) + '\n')

    with export_html.open(mode='a') as exh:
        exh.write('<h1>' + str(directory.parts[-1]) + '</h1>\n')
        exh.write('<p>The number of clusters: ' + str(len(data)) + '</p>\n')
        i = 0
        for c in data:
            i += 1
            exh.write('<h2>Cluster ' + str(i) + '</h2>\n')
            exh.write('<p>')
            for item in c:
                exh.write('<span class="nobr"><img src="' +\
                        str(item[2][0].relative_to(export_html.parent)) +\
                        '" width="' + str(item[2][1][0]) + '" height="' +\
                        str(item[2][1][1]) + '">' + str(item[1]) + '</span> ')
            exh.write('</p>\n')

dirs = detect_dirs()

for directory in sorted(dirs):
    dir_scan = scan_dir(directory)
    xm_clusters = calc_xmeans(files=dir_scan)
    export(data=xm_clusters, directory=directory)