国語研天草版テキストから対比できそうな四つ仮名を取り出すスクリプト
#!perl
# Extract comparable "yotsugana" spellings (zzu / zu / ji / gi) from the
# NINJAL Amakusa-ban text.  Words are grouped by the first few characters
# of a normalized form (zzu -> zu, gi -> ji) so spelling variants of the
# same word line up, and only groups showing an actual contrast
# (ji vs gi, or zu vs zzu) are printed.
use strict;
use warnings;
use utf8;
use open qw/:std :utf8/;

my $substr = 3;    # number of leading (normalized) characters used as the grouping key
my $file   = shift @ARGV;
open my $fh, "<", $file or die;

my %match;
my $fno  = '';     # current folio label, taken from "(...)" marker lines
my $lno  = 1;      # line number within the current folio
my $llno = 0;      # physical line number in the whole file
my $skip = 0;      # false until the title-page marker "(扉)" has been seen

while (my $line = <$fh>) {
    chomp $line;
    $llno++;

    # Skip front matter until the title-page marker appears.
    if (not $skip) {
        $skip = 1 if $line eq '(扉)';
        $fno  = '扉' if $skip;
        next;
    }

    # A line wrapped in literal parentheses names the next folio.
    # BUG FIX: the parentheses must be escaped -- the original /^((.+))$/
    # treated them as capture groups and therefore matched (and skipped)
    # every non-empty line, so no tokens were ever collected.
    if ($line =~ /^\((.+)\)$/) {
        $fno = $1;
        $lno = 1;
        next;
    }

    # Only numbered lines carry romanized text.
    next if $line !~ /^\d/;

    # Collect every word containing one of the four spellings, keyed by
    # the normalized prefix.  (The original repeated this loop body four
    # times verbatim; the table form is equivalent and processes the
    # categories in the same order per line.)
    for my $pair ([qr/\b\w*zzu\w*\b/i,      'zzu'],
                  [qr/\b\w*(?<!z)zu\w*\b/i, 'zu' ],
                  [qr/\b\w*ji\w*\b/i,       'ji' ],
                  [qr/\b\w*gi\w*\b/i,       'gi' ]) {
        my ($re, $label) = @$pair;
        while ($line =~ /$re/g) {
            my $m     = $&;
            my $m_mod = lc $m;
            $m_mod =~ s/zzu/zu/ig;    # normalize so variants share a key
            $m_mod =~ s/gi/ji/ig;
            my $m3 = substr $m_mod, 0, $substr;
            push @{$match{$m3}{$label}}, [$m, $fno, $lno, $llno];
        }
    }
    $lno++;
}

# Print only keys where a contrast exists (comment out the if-condition
# to dump every collected example instead).
for my $key (sort keys %match) {
    if (   exists $match{$key}{ji} && exists $match{$key}{gi}
        or exists $match{$key}{zu} && exists $match{$key}{zzu}) {
        my $m = $match{$key};
        # BUG FIX: the original sorted the hash itself ("sort %{$m}"),
        # which flattens keys AND values; only the keys are wanted.
        for my $yotsu_key (sort keys %{$m}) {
            for my $y (@{$m->{$yotsu_key}}) {
                printf "%s\t%s\t%s\t%s\t%s\t%s\n", $key, $yotsu_key, @$y;
            }
        }
    }
}
ばあいによって、テキストデータのBOMを落としたり改行コードを直したりする必要はある。対比ではなく全例を見たいときは、if (exists…)のif文をコメントアウトすればよい。
IDSをふくむテクストを一字づつ切り出すperlスクリプト
use strict;
use warnings;
use utf8;

# Split a string into "characters", keeping each IDS (Ideographic
# Description Sequence) together as one element.  An IDS is an IDC
# operator followed by its components: binary operators
# (⿰⿱⿴⿵⿶⿷⿸⿹⿺⿻) take two components, ternary operators (⿲⿳) take
# three.  Components may themselves be IDSes; nesting is handled by
# extending the pending-character count.
#
# Returns the list of split elements.
sub ids_split {
    my $text = shift;
    my @split;
    my $buffer_text = '';   # the IDS collected so far
    my $buffer_num  = 0;    # how many more characters belong to the current IDS

    for my $char (split //, $text) {
        if ($char =~ /[⿰⿱⿴⿵⿶⿷⿸⿹⿺⿻]/) {
            # Binary IDC: a fresh sequence is 3 chars (operator + 2);
            # nested, the operator fills one pending slot but adds two,
            # a net gain of 2.
            $buffer_num = $buffer_num ? $buffer_num + 2 : $buffer_num + 3;
        }
        elsif ($char =~ /[⿲⿳]/) {
            # Ternary IDC: 4 chars fresh, net gain of 3 when nested.
            # BUG FIX: the original used "+=" here ($buffer_num +=
            # $buffer_num + 3), doubling the pending count whenever a
            # ternary operator occurred inside another IDS, so the
            # sequence was never closed correctly.
            $buffer_num = $buffer_num ? $buffer_num + 3 : $buffer_num + 4;
        }

        if ($buffer_num) {
            $buffer_text .= $char;
            $buffer_num -= 1;
            next if $buffer_num;        # still inside the IDS
            push @split, $buffer_text;  # IDS complete
            $buffer_text = '';
        }
        else {
            push @split, $char;         # ordinary character
        }
    }
    return @split;
}
1;
臨時仮名遣調査委員会議事速記録(1909)の著作権に関するおぼえがき
たとへば、国会会議録検索システム -FAQ-など。臨時仮名遣調査委員会議事速記録 - 国立国会図書館デジタルコレクションにあるが、文部大臣官房図書課だけの著作権を認めてよいのかはよく分からないところなので(あるいは個々に調べてあるのかもしれないが、DBには載ってないので)、調べてみる。1967年までに全員没してゐればそれで当面はよいといふことになる。
索引にあるのは次の面々。
- 牧野伸顕(文部大臣)†1949
- 菊池大麓(委員長)†1917
- 曽我祐準(委員)†1935
- 松平正直(委員)†1915
- 浅田徳則(委員)†1933
- 岡部長職(委員)†1925
- 矢野文雄(委員)†1931
- 森林太郎(委員)†1922
- 岡野敬次郎(委員)†1925
- 伊知地彦次郎(委員)*1†1912
- 伊沢修二(委員)†1917
- 芳賀矢一(委員)†1927
- 藤岡好古(委員)†1917
- 大槻文彦(委員)†1928
- 江原素六(委員)†1922
- 三宅雄二(次)郎(委員)†1945
- 渡部董之介(番外*2)†1938
索引にないが著作者として現れてゐるのは次の面々。
- 井上毅(附録)†1895
- 米国大使館ドッジ氏(附録)不明
なほ、発言してゐないが委員なのは次の面々。
「日本古典籍字形データセットをかんたんに分類してくれるPythonスクリプト」の結果を(手で)修正した量の統計を出すスクリプト
#!/usr/bin/perl
# Report how much the output of the "simple classifier for the PMJT
# character-shape dataset" Python script was corrected by hand: per
# material/codepoint, the item total, the residual classification
# errors, and the change in the number of clusters.
use 5.012;
use strict;
use warnings;
use utf8;
use Cwd;
use FindBin;
use Data::Dumper;

require "$FindBin::Bin/clgr.pl";

my $wd = Cwd::getcwd();
my @directories = clgr::scan_dir($wd);

say join ",", 'Mat', 'U+ID', 'Total', 'Err', 'Err%', 'Orig No C', 'Mod No C', 'Diff';

for my $dir (@directories) {
    my @path = clgr::gen_path($dir);
    my $csv  = clgr::read_csv($dir);
    my $data = clgr::cluster($csv);

    # %stat{<mod cluster>}{<orig cluster>} = item count;
    # %c_no counts distinct clusters before (org) and after (mod) editing.
    my %stat = ();
    my %c_no = ();
    my $i = 0;
    for my $c (@$data) {
        $c_no{mod}{$i}++;
        for my $item (@$c) {
            $stat{$i}{$item->[1]}++;
            $c_no{org}{$item->[1]}++;
        }
        $i++;
    }

    # An item counts as an error when it does not belong to the majority
    # original cluster of its modified cluster.
    my $error_no = 0;
    my $total    = 0;
    for my $stat_key (sort keys %stat) {
        my $cluster_max   = 0;
        my $cluster_total = 0;
        for my $original_stat_key (sort keys %{$stat{$stat_key}}) {
            my $n = $stat{$stat_key}{$original_stat_key};
            $cluster_total += $n;
            $cluster_max = $n if $n > $cluster_max;
        }
        $total    += $cluster_total;
        $error_no += $cluster_total - $cluster_max;
    }

    my $orig_no_c = scalar keys %{$c_no{org}};
    my $mod_no_c  = scalar keys %{$c_no{mod}};

    # ROBUSTNESS FIX: an empty CSV gave a division-by-zero abort in the
    # original; report 0.00% instead and keep going.
    my $err_pct = $total ? sprintf("%.2f", $error_no / $total * 100) : '0.00';

    # Columns: Mat, U+ID, Total, Err, Err%, Orig No C, Mod No C,
    # Diff b/w Orig & Mod No C
    say join ",", $path[-3], $path[-1], $total, $error_no, $err_pct,
        $orig_no_c, $mod_no_c, abs($orig_no_c - $mod_no_c);
}
「日本古典籍字形データセットをかんたんに分類してくれるPythonスクリプト」の結果を修正してHTMLに再出力するスクリプト
#!/usr/bin/perl
# Re-emit the hand-corrected classification of the PMJT character-shape
# dataset: one HTML gallery and one summary CSV per material, with the
# section for each scanned codepoint directory appended in turn.
use 5.012;
use strict;
use warnings;
use utf8;
use Cwd;
use FindBin;
use Data::Dumper;

require "$FindBin::Bin/clgr.pl";

my $started_at = time();
my $cwd = Cwd::getcwd();

for my $target (clgr::scan_dir($cwd)) {
    my @parts    = clgr::gen_path($target);
    my $material = $parts[-3];
    my $out_html = "$material/${material}_mod.html";
    my $out_csv  = "$material/$material.csv";

    # (Re)write each header once per run: when the output file is absent
    # or was last modified before this run started.
    if (!-e $out_html || (stat $out_html)[9] < $started_at) {
        open my $fh, ">", $out_html or die $out_html;
        print $fh clgr::regen_html_header();
    }
    if (!-e $out_csv || (stat $out_csv)[9] < $started_at) {
        open my $fh, ">", $out_csv or die $out_csv;
        print $fh clgr::regen_csv_header();
    }

    my $rows     = clgr::read_csv($target);
    my $clusters = clgr::cluster($rows);

    my $html_body = clgr::regen_html($parts[-1], "$cwd/$material", $clusters);
    my $csv_body  = clgr::regen_csv($parts[-1], $material, $clusters);

    open my $html_out, ">>", $out_html or die $out_html;
    print $html_out $html_body;
    open my $csv_out, ">>", $out_csv or die $out_csv;
    print $csv_out $csv_body;
}
clgr.pl
#!which perl
package clgr;
# Shared helpers for the PMJT character-shape reclassification scripts:
# directory scanning, CSV reading, cluster reshaping, and HTML/CSV
# re-export.
use strict;
use warnings;
use utf8;
use File::Spec;
use Image::Size qw/html_imgsize/;

# Hiragana codepoint directory names that hold character images.
my @kana = qw(
    U+3042 U+3044 U+3046 U+3048 U+304A U+304B U+304C U+304D U+304E U+304F
    U+3050 U+3051 U+3052 U+3053 U+3054 U+3055 U+3056 U+3057 U+3058 U+3059
    U+305A U+305B U+305C U+305D U+305E U+305F U+3060 U+3061 U+3062 U+3064
    U+3065 U+3066 U+3067 U+3068 U+3069 U+306A U+306B U+306C U+306D U+306E
    U+306F U+3070 U+3072 U+3073 U+3075 U+3076 U+3078 U+3079 U+307B U+307C
    U+307E U+307F U+3080 U+3081 U+3082 U+3084 U+3086 U+3088 U+3089 U+308A
    U+308B U+308C U+308D U+308F U+3090 U+3091 U+3092 U+3093
);

# Recursively collect every kana-named directory below $dir.
sub scan_dir {
    my $dir = shift;
    $dir .= '/' if $dir !~ m!/\z!ms;
    my @dirs = ();
    opendir(my $dh, $dir) or die $dir;
    while (my $rdir = readdir($dh)) {
        next if $rdir =~ /\A\.+\z/ms;    # skip . and ..
        if (-d $dir . $rdir) {
            if (grep {$_ eq $rdir} @kana) {
                push @dirs, $dir . $rdir;
            }
            else {
                my @subdir = scan_dir($dir . $rdir);
                push @dirs, @subdir;
            }
        }
    }
    return @dirs;
}

# Read "<U+xxxx>.csv" inside $dir.  Returns an array ref of rows (each
# an array ref of fields); an empty third field (the hand-corrected
# cluster) falls back to the original cluster in field 0.
sub read_csv {
    my $dir = shift;
    my @csv = ();
    # BUG FIX: verify the basename match instead of using a possibly
    # stale $1 when the directory name does not look like "U+xxxx".
    $dir =~ m!/(U[^/]+)\z!ms or die "unexpected directory name: $dir";
    my $file = $1 . ".csv";
    # BUG FIX: three-arg open; the original two-arg form would honour
    # mode characters embedded in the path.
    open my $fh, "<", "$dir/$file" or die "$dir/$file";
    my $head = 0;
    while (my $line = <$fh>) {
        next if not $head++;    # skip the header row
        chomp $line;
        $line =~ tr/"//d;       # strip quoting
        # Flag unexpected characters; rows are assumed to be plain ASCII
        # paths and numbers.
        warn $& if $line =~ /[^\.,\/\+\_A-Za-z0-9]/;
        my @split = split ',', $line;
        if (not $split[2] and $split[0]) {
            $split[2] = $split[0];
        }
        push @csv, \@split;
    }
    return \@csv;
}

# Group CSV rows by their (possibly hand-corrected) cluster number.
# Clusters are ordered largest first, items by sequence number.
sub cluster {
    my $csv = shift;
    my @cluster = ();
    for my $array (@$csv) {
        # $cluster[<mod cl>] = [ [<seq>, <org cl>, <path>], ... ]
        push @{$cluster[$array->[2]]}, [$array->[1], $array->[0], $array->[3]];
    }
    @cluster = sort { scalar(@$b) <=> scalar(@$a) }
               map  { [ sort { $a->[0] <=> $b->[0] } @$_ ] }
               grep { $_ } @cluster;
    return \@cluster;
}

# Opening boilerplate shared by all generated HTML files.
sub regen_html_header {
    return <<EOF;
<html><head><style>span.nobr{white-space:nowrap;}</style></head><body>
EOF
}

# Render one codepoint's clusters as an HTML section of inline images.
sub regen_html {
    my $uni      = shift;
    my $html_dir = shift;
    my $cluster  = shift;
    my $html = "<h1>$uni</h1>\n";
    my $i = 1;
    for my $c (@$cluster) {
        $html .= "<h2>Cluster $i: " . scalar(@$c) . " items</h2>\n";
        $html .= "<p>";
        for my $item (@$c) {
            my $img_loc = File::Spec->rel2abs($item->[2], $html_dir);
            my $size    = html_imgsize($img_loc);  # width/height attribute string
            $html .= qq(<span class="nobr"><img src="$item->[2]" $size>$item->[0]</span> );
        }
        $i++;
        $html .= "</p>\n";
    }
    return $html;
}

# Header row for the summary CSV.
sub regen_csv_header {
    return join(',', 'mat', 'u', 'cluster', 'counts') . "\n";
}

# One CSV line per cluster: material, codepoint, cluster index, size.
sub regen_csv {
    my $uni     = shift;
    my $mat     = shift;
    my $cluster = shift;
    my $csv = '';
    my $i = 1;
    for my $c (@$cluster) {
        $csv .= join ',', $mat, $uni, $i, scalar @$c;
        $csv .= "\n";
        $i++;
    }
    return $csv;
}

# Split a directory path into its components.
sub gen_path {
    my $basedir = shift;
    return File::Spec->splitdir($basedir);
}
1;
日本古典籍字形データセットをかんたんに分類してくれるPythonスクリプト
#!/usr/bin/python3
# coding: utf-8
#
# Usage: Run on the directory just above where the Dataset of PMJT
# Character Shapes (http://codh.rois.ac.jp/char-shape/) is downloaded.
#
# Clusters the character images of each kana directory by image size
# (x-means) and writes the result as one CSV per codepoint directory
# plus one HTML gallery per material.
from pathlib import Path
from time import time
from PIL import Image
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

ext = '.jpg'
kana = [
    'U+3042', 'U+3044', 'U+3046', 'U+3048', 'U+304A', 'U+304B', 'U+304C',
    'U+304D', 'U+304E', 'U+304F', 'U+3050', 'U+3051', 'U+3052', 'U+3053',
    'U+3054', 'U+3055', 'U+3056', 'U+3057', 'U+3058', 'U+3059', 'U+305A',
    'U+305B', 'U+305C', 'U+305D', 'U+305E', 'U+305F', 'U+3060', 'U+3061',
    'U+3062', 'U+3064', 'U+3065', 'U+3066', 'U+3067', 'U+3068', 'U+3069',
    'U+306A', 'U+306B', 'U+306C', 'U+306D', 'U+306E', 'U+306F', 'U+3070',
    'U+3072', 'U+3073', 'U+3075', 'U+3076', 'U+3078', 'U+3079', 'U+307B',
    'U+307C', 'U+307E', 'U+307F', 'U+3080', 'U+3081', 'U+3082', 'U+3084',
    'U+3086', 'U+3088', 'U+3089', 'U+308A', 'U+308B', 'U+308C', 'U+308D',
    'U+308F', 'U+3090', 'U+3091', 'U+3092', 'U+3093', 'U+309D', 'U+309E',
    'U+30B5', 'U+30C4', 'U+30CB', 'U+30F6'
]
basedir = Path.cwd()
now = time()


def detect_dirs(scandir=None) -> list:
    '''Recursively collect every directory whose name is in the kana list.'''
    if not scandir:
        scandir = Path(basedir)
    dirs = []
    for component in scandir.iterdir():
        if component.name in kana:
            dirs.append(component)
        elif component.is_dir():
            dirs += detect_dirs(component)
    return dirs


def scan_dir(scandir) -> list:
    '''List the image files of a kana directory together with their
    sizes, digging a single level of sub-directories.'''
    if not scandir.is_dir():
        return []
    files = []
    for f in sorted(scandir.iterdir()):
        if f.is_dir():
            for sf in sorted(f.iterdir()):
                if sf.name.endswith(ext) and sf.is_file():
                    files.append(is_img(sf))
        elif f.name.endswith(ext) and f.is_file():
            files.append(is_img(f))
    return files


def is_img(filename) -> list:
    '''Pair a file path with its image size as [path, [w, h]].'''
    return [filename, img_size(filename)]


def img_size(filename) -> list:
    '''Return the [width, height] of an image file.'''
    img = Image.open(filename, 'r')
    return list(img.size)


def calc_xmeans(files: list) -> list:
    '''Cluster images by their size with x-means; the number of initial
    centers grows with the sample size.  Returns a list of clusters,
    each a list of [cluster_no, seq_no, file_entry].'''
    sample = [f[1] for f in files]
    if len(sample) < 15:
        amount = 1
    elif len(sample) > 50:
        amount = 3
    else:
        amount = 2
    xm_c = kmeans_plusplus_initializer(sample, amount).initialize()
    xm_i = xmeans(sample, xm_c, ccore=True)
    # xm_i = xmeans(sample, xm_c, ccore=False)  # Use this line on Darwin, and pray
    xm_i.process()
    clgr = []
    for j, c in enumerate(xm_i.get_clusters()):
        clgr.append([[j, i, files[i]] for i in c])
    return clgr


def export(data: list, directory) -> None:
    '''Write the clustering of one directory to its CSV and append its
    section to the per-material HTML gallery.'''
    export_csv = directory / (directory.parts[-1] + '.csv')
    export_html = Path(directory / '../../').resolve() / (directory.parts[-3] + '.html')
    # BUG FIX: the original only wrote the header when the file already
    # existed ("exists() and ... < now"); a missing file must get the
    # header too, matching the stale-file case.
    if not export_html.exists() or export_html.stat().st_mtime < now:
        with export_html.open(mode='w') as exh:
            exh.write('<html><head><style>span.nobr{white-space:nowrap;}</style></head><body>')
    with export_csv.open(mode='w') as exc:
        exc.write(','.join(['cluster no', 'seq', 'mod cluster no', 'file name']) + '\n')
        for c in data:
            for item in c:
                # BUG FIX: this statement was syntactically broken in the
                # original (unclosed join/list, and it referenced "item"
                # while the loop variable was "i").  The empty third
                # column is the slot for hand-corrected cluster numbers.
                exc.write(','.join([str(item[0]), str(item[1]), '',
                                    str(item[2][0].relative_to(export_html.parent))]) + '\n')
    with export_html.open(mode='a') as exh:
        exh.write('<h1>' + str(directory.parts[-1]) + '</h1>\n')
        exh.write('<p>The number of clusters: ' + str(len(data)) + '</p>\n')
        for i, c in enumerate(data, start=1):
            exh.write('<h2>Cluster ' + str(i) + '</h2>\n')
            exh.write('<p>')
            for item in c:
                exh.write('<span class="nobr"><img src="' +
                          str(item[2][0].relative_to(export_html.parent)) +
                          '" width="' + str(item[2][1][0]) + '" height="' +
                          str(item[2][1][1]) + '">' + str(item[1]) + '</span> ')
            exh.write('</p>\n')


dirs = detect_dirs()
for directory in sorted(dirs):
    dir_scan = scan_dir(directory)
    xm_clusters = calc_xmeans(files=dir_scan)
    export(data=xm_clusters, directory=directory)