「日本古典籍字形データセットをかんたんに分類してくれるPythonスクリプト」の結果を修正してHTMLに再出力するスクリプト
#!/usr/bin/perl use 5.012; use strict; use warnings; use utf8; use Cwd; use FindBin; use Data::Dumper; require "$FindBin::Bin/clgr.pl"; my $now = time(); my $wd = Cwd::getcwd(); my @directories = clgr::scan_dir($wd); for my $dir (@directories) { my @path = clgr::gen_path($dir); my $html_file = "$path[-3]/$path[-3]_mod.html"; my $csv_file = "$path[-3]/$path[-3].csv"; if (not -e $html_file or (stat $html_file)[9] < $now) { open my $html_fh, ">", $html_file or die $html_file; print $html_fh clgr::regen_html_header(); } if (not -e $csv_file or (stat $csv_file)[9] < $now) { open my $csv_fh, ">", $csv_file or die $csv_file; print $csv_fh clgr::regen_csv_header(); } my $data = clgr::read_csv($dir); my $cluster = clgr::cluster($data); my $html = clgr::regen_html($path[-1], "$wd/$path[-3]", $cluster); my $csv = clgr::regen_csv($path[-1], $path[-3], $cluster); open my $html_fh, ">>", $html_file or die $html_file; print $html_fh $html; open my $csv_fh, ">>", $csv_file or die $csv_file; print $csv_fh $csv; }
clgr.pl
#!which perl package clgr; use strict; use warnings; use utf8; use File::Spec; use Image::Size qw/html_imgsize/; my @kana = ( 'U+3042', 'U+3044', 'U+3046', 'U+3048', 'U+304A', 'U+304B', 'U+304C', 'U+304D', 'U+304E', 'U+304F', 'U+3050', 'U+3051', 'U+3052', 'U+3053', 'U+3054', 'U+3055', 'U+3056', 'U+3057', 'U+3058', 'U+3059', 'U+305A', 'U+305B', 'U+305C', 'U+305D', 'U+305E', 'U+305F', 'U+3060', 'U+3061', 'U+3062', 'U+3064', 'U+3065', 'U+3066', 'U+3067', 'U+3068', 'U+3069', 'U+306A', 'U+306B', 'U+306C', 'U+306D', 'U+306E', 'U+306F', 'U+3070', 'U+3072', 'U+3073', 'U+3075', 'U+3076', 'U+3078', 'U+3079', 'U+307B', 'U+307C', 'U+307E', 'U+307F', 'U+3080', 'U+3081', 'U+3082', 'U+3084', 'U+3086', 'U+3088', 'U+3089', 'U+308A', 'U+308B', 'U+308C', 'U+308D', 'U+308F', 'U+3090', 'U+3091', 'U+3092', 'U+3093' ); sub scan_dir { my $dir = shift; $dir .= '/' if $dir !~ m!/\z!ms; my @dirs = (); opendir(my $dh, $dir) or die $dir; while (my $rdir = readdir($dh)) { next if $rdir =~ /\A\.+\z/ms; if (-d $dir . $rdir) { if (grep {$_ eq $rdir} @kana) { push @dirs, $dir . $rdir; } else { my @subdir = scan_dir($dir . $rdir); push @dirs, @subdir; } } } return @dirs; } sub read_csv { my $dir = shift; my @csv = (); $dir =~ m!/(U[^/]+)\z!ms; my $file = $1 . ".csv"; open my $fh, "$dir/$file" or die "$dir/$file"; my $head = 0; while(my $line = <$fh>) { next if not $head++; chomp $line; $line =~ tr/"//d; warn $& if $line =~ /[^\.,\/\+\_A-Za-z0-9]/; my @split = split ',', $line; if (not $split[2] and $split[0]) { $split[2] = $split[0]; } push @csv, \@split; } return \@csv; } sub cluster { my $csv = shift; my @cluster = (); for my $array (@$csv) { push @{$cluster[$array->[2]]}, [$array->[1], $array->[0], $array->[3]]; # $cluster{<mod cl>} = [<seq>, <org cl>, <path>] } @cluster = sort { scalar(@$b) <=> scalar(@$a) } map { [ sort { $a->[0] <=> $b->[0] } @$_ ] } grep { $_ } @cluster; return \@cluster; } sub regen_html_header { return <<EOF; <html><head><style>span.nobr{white-space:nowrap;}</style></head><body> EOF } sub regen_html { my $uni = shift; my $html_dir = shift; my $cluster = shift; my $html = "<h1>$uni</h1>\n"; my $i = 1; for my $c (@$cluster) { $html .= "<h2>Cluster $i: " . scalar(@$c) . " items</h2>\n"; $html .= "<p>"; for my $item (@$c) { my $img_loc = File::Spec->rel2abs($item->[2], $html_dir); my $size = html_imgsize($img_loc); $html .= qq(<span class="nobr"><img src="$item->[2]" $size>$item->[0]</span> ); } $i++; $html .= "</p>\n"; } return $html; } sub regen_csv_header { return join(',', 'mat', 'u', 'cluster', 'counts') . "\n"; } sub regen_csv { my $uni = shift; my $mat = shift; my $cluster = shift; my $csv = ''; my $i = 1; for my $c (@$cluster) { $csv .= join ',', $mat, $uni, $i, scalar @$c; $csv .= "\n"; $i++; } return $csv; } sub gen_path { my $basedir = shift; return File::Spec->splitdir($basedir); } 1;