kzhr's diary

ad ponendum


use strict;
use warnings;
use utf8;

# Split a string into a list in which every Ideographic Description Sequence
# (IDS) stays together as one element and every other character becomes an
# element of its own.
#
# An IDS begins with an Ideographic Description Character (IDC).  The binary
# IDCs take two operands, the ternary IDCs (⿲ ⿳) take three, and an operand
# may itself be another IDS.
#
# Parameters:
#   $text - character string (already decoded) to split
# Returns:
#   list of strings; a sequence that is still incomplete at the end of the
#   input is returned as the last element rather than dropped.
sub ids_split {
	my $text = shift;
	my @split;
	my $buffer_text = '';
	# Number of characters still needed to complete the sequence being read.
	my $buffer_num = 0;
	for my $char (split //, $text) {
		# An outer IDC opens 1 + N slots.  A nested IDC fills one slot that is
		# already counted, so it adds only N (the decrement below consumes the
		# slot taken by the IDC itself).
		if ($char =~ /[⿰⿱⿴⿵⿶⿷⿸⿹⿺⿻]/) {
			$buffer_num = $buffer_num ? $buffer_num + 2 : $buffer_num + 3;
		} elsif ($char =~ /[⿲⿳]/) {
			# Plain assignment: `+=` here would double the pending count for a
			# nested ternary IDC and the sequence would never be emitted.
			$buffer_num = $buffer_num ? $buffer_num + 3 : $buffer_num + 4;
		}

		if ($buffer_num) {
			$buffer_text .= $char;
			$buffer_num -= 1;
			next if $buffer_num;
			push @split, $buffer_text;
			$buffer_text = '';
		} else {
			push @split, $char;
		}
	}
	# Keep a truncated trailing sequence instead of silently losing text.
	push @split, $buffer_text if length $buffer_text;
	return @split;
}



たとへば、国会会議録検索システム -FAQ-など。臨時仮名遣調査委員会議事速記録 - 国立国会図書館デジタルコレクションにあるが、文部大臣官房図書課だけの著作権を認めてよいのかはよく分からないところなので(あるいは個々に調べてあるのかもしれないが、DBには載ってないので)、調べてみる。1967年までに全員没してゐればそれで当面はよいといふことになる。


  1. 牧野伸顕(文部大臣)†1949
  2. 菊池大麓(委員長)†1917
  3. 曽我祐準(委員)†1935
  4. 松平正直(委員)†1915
  5. 浅田徳則(委員)†1933
  6. 岡部長職(委員)†1925
  7. 矢野文雄(委員)†1931
  8. 森林太郎(委員)†1922
  9. 岡野敬次郎(委員)†1925
  10. 伊知地彦次郎(委員)*1†1912
  11. 伊沢修二(委員)†1917
  12. 芳賀矢一(委員)†1927
  13. 藤岡好古(委員)†1917
  14. 大槻文彦(委員)†1928
  15. 江原素六(委員)†1922
  16. 三宅雄二(次)郎(委員)†1945
  17. 渡部董之介(番外*2)†1938


  1. 井上毅(附録)†1895
  2. 米国大使館ドッジ氏(附録)不明


  1. 小牧昌業
  2. 山川健次郎
  3. 小松謙二郎
  4. 井上哲次郎
  5. 上田萬年
  6. 徳富猪一郎
  7. 横井時雄
  8. 松村茂助
  9. 島田三郎
  10. 鎌田栄吉
  11. 板根友敬
  12. 土舘長言
  13. 肥塚籠





use 5.012;

use strict;
use warnings;
use utf8;

use Cwd;
use FindBin;

use Data::Dumper;

require "$FindBin::Bin/";

# Print, as CSV on STDOUT, one statistics row for each directory returned by
# clgr::scan_dir() for the current working directory.
#
# NOTE(review): this copy of the script is incomplete.  The closing braces of
# every loop are missing, and so is the body of the innermost
# `for my $item` loop -- presumably the place where %stat, %c_no and $i are
# updated.  Restore it from the original before running; nothing below can be
# verified without it.
my $wd = Cwd::getcwd();
my @directories = clgr::scan_dir($wd);

# Column header of the report.
say join ",", 'Mat', 'U+ID', 'Total', 'Err', 'Err%', 'Orig No C', 'Mod No C', 'Diff';

for my $dir (@directories) {
  my @path = clgr::gen_path($dir);
  my $csv = clgr::read_csv($dir);
  my $data = clgr::cluster($csv);

  # %stat is read below as $stat{<key>}{<original key>} = count.
  my %stat = ();
  my $i = 0;
  # %c_no is read below through its {org} and {mod} sub-hashes.
  my %c_no = ();

  for my $c (@$data) {
    for my $item (@$c){
  my @stat_keys = sort keys %stat;
  my $error_no = 0;
  my $total = 0;

  # For every first-level key: sum all counts, and treat everything except the
  # largest single count as errors.
  for my $stat_key (@stat_keys) {
    my $cluster_max = 0;
    my $cluster_total = 0;
    my @original_stat_keys = sort keys %{$stat{$stat_key}};
    for my $original_stat_key (@original_stat_keys) {
      $cluster_total += $stat{$stat_key}{$original_stat_key};
      $cluster_max = $stat{$stat_key}{$original_stat_key} > $cluster_max ? $stat{$stat_key}{$original_stat_key} : $cluster_max;
    $total += $cluster_total;
    $error_no += $cluster_total - $cluster_max;

  my $orig_no_c = scalar keys %{$c_no{org}};
  my $mod_no_c = scalar keys %{$c_no{mod}};
  # 'Mat', 'U+ID', 'Total', 'Err', 'Err%', 'Orig No C', 'Mod No C', 'Diff b/w Orig & Mod No C'
  # NOTE(review): the Err% expression divides by $total; a directory whose CSV
  # has no rows would presumably make this die -- confirm and guard.
  say join ",", $path[-3], $path[-1], $total, $error_no, sprintf("%.2f", $error_no / $total * 100), $orig_no_c, $mod_no_c, abs($orig_no_c - $mod_no_c);
#  warn Dumper($data, \%stat);
#  last;



use 5.012;

use strict;
use warnings;
use utf8;

use Cwd;
use FindBin;

use Data::Dumper;

require "$FindBin::Bin/";

# Rebuild the per-material reports "<mat>/<mat>_mod.html" and "<mat>/<mat>.csv"
# (paths relative to the current working directory) from the CSV found in
# every directory returned by clgr::scan_dir().
#
# $now is the start time of this run: a report file that does not exist or
# was last modified before $now is recreated with only its header, and every
# directory then appends its own section to it.
my $now = time();

my $wd = Cwd::getcwd();
my @directories = clgr::scan_dir($wd);

for my $dir (@directories) {
    # $path[-1] is the directory's own name and $path[-3] the component two
    # levels above it, which names the report files.
    my @path = clgr::gen_path($dir);
    my $html_file = "$path[-3]/$path[-3]_mod.html";
    my $csv_file = "$path[-3]/$path[-3].csv";

    if (not -e $html_file or (stat $html_file)[9] < $now) {
        open my $html_fh, ">", $html_file or die "$html_file: $!";
        print $html_fh clgr::regen_html_header();
        close $html_fh or die "$html_file: $!";
    }
    if (not -e $csv_file or (stat $csv_file)[9] < $now) {
        open my $csv_fh, ">", $csv_file or die "$csv_file: $!";
        print $csv_fh clgr::regen_csv_header();
        close $csv_fh or die "$csv_file: $!";
    }

    my $data = clgr::read_csv($dir);
    my $cluster = clgr::cluster($data);
    my $html = clgr::regen_html($path[-1], "$wd/$path[-3]", $cluster);
    my $csv = clgr::regen_csv($path[-1], $path[-3], $cluster);

    # Buffered write errors only surface at close, so check it.
    open my $html_fh, ">>", $html_file or die "$html_file: $!";
    print $html_fh $html;
    close $html_fh or die "$html_file: $!";
    open my $csv_fh, ">>", $csv_file or die "$csv_file: $!";
    print $csv_fh $csv;
    close $csv_fh or die "$csv_file: $!";
}

#!which perl

package clgr;

use strict;
use warnings;
use utf8;
use File::Spec;
use Image::Size qw/html_imgsize/;

# Directory names, written as "U+XXXX" code points from the Unicode Hiragana
# block, that scan_dir() collects.
my @kana = (
    'U+3042', 'U+3044', 'U+3046', 'U+3048', 'U+304A', 'U+304B',
    'U+304C', 'U+304D', 'U+304E', 'U+304F', 'U+3050', 'U+3051',
    'U+3052', 'U+3053', 'U+3054', 'U+3055', 'U+3056', 'U+3057',
    'U+3058', 'U+3059', 'U+305A', 'U+305B', 'U+305C', 'U+305D',
    'U+305E', 'U+305F', 'U+3060', 'U+3061', 'U+3062', 'U+3064',
    'U+3065', 'U+3066', 'U+3067', 'U+3068', 'U+3069', 'U+306A',
    'U+306B', 'U+306C', 'U+306D', 'U+306E', 'U+306F', 'U+3070',
    'U+3072', 'U+3073', 'U+3075', 'U+3076', 'U+3078', 'U+3079',
    'U+307B', 'U+307C', 'U+307E', 'U+307F', 'U+3080', 'U+3081',
    'U+3082', 'U+3084', 'U+3086', 'U+3088', 'U+3089', 'U+308A',
    'U+308B', 'U+308C', 'U+308D', 'U+308F', 'U+3090', 'U+3091',
    'U+3092', 'U+3093'
);

# Recursively collect every directory below $dir whose name is listed in
# @kana.  A matching directory is returned and not descended into; any other
# directory is searched in turn.
#
# Parameters:
#   $dir - directory to search (a trailing slash is added if absent)
# Returns:
#   list of paths ("$dir/<name>"), in sorted traversal order
# Dies:
#   when a directory cannot be opened
sub scan_dir {
    my $dir = shift;
    $dir .= '/' if $dir !~ m!/\z!ms;
    my @dirs = ();
    opendir(my $dh, $dir) or die "$dir: $!";
    # Read everything first so the handle is closed before recursing, and
    # sort so the result does not depend on the file system's own order.
    my @entries = sort readdir($dh);
    closedir($dh);
    for my $rdir (@entries) {
        next if $rdir =~ /\A\.+\z/ms;
        next if not -d $dir . $rdir;
        if (grep {$_ eq $rdir} @kana) {
            push @dirs, $dir . $rdir;
        } else {
            push @dirs, scan_dir($dir . $rdir);
        }
    }
    return @dirs;
}

# Read "<dir>/<name>.csv", where <name> is the last component of $dir and
# must start with "U".
#
# The first line is treated as a header and skipped.  Double quotes are
# removed and each remaining line is split on commas.  Column 2 (the modified
# cluster number) defaults to column 0 (the original cluster number) when it
# is empty or absent; an explicit value, including 0, is kept.
#
# Parameters:
#   $dir - directory path ending in ".../U<something>"
# Returns:
#   reference to an array of array references, one per data line
# Dies:
#   when $dir does not have the expected form or the file cannot be opened
sub read_csv {
    my $dir = shift;
    my @csv = ();
    $dir =~ m!/(U[^/]+)\z!ms
        or die "$dir: last path component does not start with U";
    my $file = "$dir/$1.csv";
    # Three-argument open: the path can never be taken for an open mode.
    open my $fh, '<', $file or die "$file: $!";
    my $head = 0;
    while (my $line = <$fh>) {
        next if not $head++;
        chomp $line;
        $line =~ tr/"//d;
        # A blank line would otherwise become an item with no cluster number.
        next if $line eq '';
        # Report the first character outside the expected set.
        warn $& if $line =~ /[^\.,\/\+\_A-Za-z0-9]/;
        my @split = split ',', $line;
        # Test for emptiness, not truth: "0" is a valid cluster number.
        if (not (defined $split[2] and length $split[2])) {
            $split[2] = $split[0];
        }
        push @csv, \@split;
    }
    close $fh;
    return \@csv;
}

# Group CSV rows by their modified cluster number.
#
# Parameters:
#   $csv - reference to an array of rows [<org cl>, <seq>, <mod cl>, <path>]
# Returns:
#   reference to an array of clusters, largest first; each cluster is an
#   array reference of [<seq>, <org cl>, <path>] items sorted by <seq>.
#   Cluster numbers that have no rows are dropped.
sub cluster {
    my $csv = shift;
    my @cluster = ();
    for my $array (@$csv) {
        # $cluster[<mod cl>] = [ [<seq>, <org cl>, <path>], ... ]
        push @{$cluster[$array->[2]]}, [$array->[1], $array->[0], $array->[3]];
    }
    @cluster =
        sort { scalar(@$b) <=> scalar(@$a) }
        map  { [ sort { $a->[0] <=> $b->[0] } @$_ ] }
        grep { $_ } @cluster;
    return \@cluster;
}

# Return the text of a here-document terminated by EOF; the regeneration
# script prints it at the top of a freshly created HTML report.
# NOTE(review): in this copy the here-document body, its EOF terminator and
# the closing brace of the sub are all missing -- restore them from the
# original source before use.
sub regen_html_header {
    return <<EOF;

# Build the HTML section for one character.
#
# Parameters:
#   $uni      - heading text (the "U+XXXX" directory name)
#   $html_dir - directory against which the item paths are resolved when the
#               image files are measured
#   $cluster  - clusters as returned by cluster(): an array reference of
#               array references of [<seq>, <org cl>, <path>] items
# Returns:
#   HTML string: an <h1>, then per cluster a numbered <h2> and a <p> holding
#   one <img> (followed by its <seq>) per item
sub regen_html {
    my $uni = shift;
    my $html_dir = shift;
    my $cluster = shift;
    my $html = "<h1>$uni</h1>\n";
    my $i = 1;
    for my $c (@$cluster) {
        $html .= "<h2>Cluster $i: " . scalar(@$c) . " items</h2>\n";
        $html .= "<p>";
        for my $item (@$c) {
            my $img_loc = File::Spec->rel2abs($item->[2], $html_dir);
            my $size = html_imgsize($img_loc);
            # An unreadable image yields no size; emit the tag without one
            # instead of interpolating undef.
            $size = '' if not defined $size;
            $html .= qq(<span class="nobr"><img src="$item->[2]" $size>$item->[0]</span> );
        }
        $html .= "</p>\n";
        # Without this every heading would read "Cluster 1".
        $i++;
    }
    return $html;
}

# Return the header line (newline-terminated) of the per-material CSV report.
sub regen_csv_header {
    return join(',', 'mat', 'u', 'cluster', 'counts') . "\n";
}

# Build the CSV rows for one character: one "mat,u,cluster,counts" line per
# cluster, with clusters numbered from 1 in the order given.
#
# Parameters:
#   $uni     - value of the "u" column
#   $mat     - value of the "mat" column
#   $cluster - array reference of clusters (array references of items)
# Returns:
#   string of newline-terminated rows; empty when there are no clusters
sub regen_csv {
    my $uni = shift;
    my $mat = shift;
    my $cluster = shift;
    my $csv = '';
    my $i = 1;
    for my $c (@$cluster) {
        $csv .= join ',', $mat, $uni, $i, scalar @$c;
        $csv .= "\n";
        # Without this every row would carry cluster number 1.
        $i++;
    }
    return $csv;
}

# Split a directory path into its components.
#
# Parameters:
#   $basedir - directory path
# Returns:
#   list of components as produced by File::Spec->splitdir
sub gen_path {
    my $basedir = shift;
    return File::Spec->splitdir($basedir);
}



# coding: utf-8

# Usage: run from the directory just above the one into which the Dataset of
# PMJT Character Shapes has been downloaded.

from pathlib import Path
from time import time

from PIL import Image
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

ext = '.jpg'

# Directory names ("U+XXXX" code points) that detect_dirs() looks for.
kana = [
    'U+3042', 'U+3044', 'U+3046', 'U+3048', 'U+304A', 'U+304B',
    'U+304C', 'U+304D', 'U+304E', 'U+304F', 'U+3050', 'U+3051',
    'U+3052', 'U+3053', 'U+3054', 'U+3055', 'U+3056', 'U+3057',
    'U+3058', 'U+3059', 'U+305A', 'U+305B', 'U+305C', 'U+305D',
    'U+305E', 'U+305F', 'U+3060', 'U+3061', 'U+3062', 'U+3064',
    'U+3065', 'U+3066', 'U+3067', 'U+3068', 'U+3069', 'U+306A',
    'U+306B', 'U+306C', 'U+306D', 'U+306E', 'U+306F', 'U+3070',
    'U+3072', 'U+3073', 'U+3075', 'U+3076', 'U+3078', 'U+3079',
    'U+307B', 'U+307C', 'U+307E', 'U+307F', 'U+3080', 'U+3081',
    'U+3082', 'U+3084', 'U+3086', 'U+3088', 'U+3089', 'U+308A',
    'U+308B', 'U+308C', 'U+308D', 'U+308F', 'U+3090', 'U+3091',
    'U+3092', 'U+3093', 'U+309D', 'U+309E', 'U+30B5', 'U+30C4',
    'U+30CB', 'U+30F6'
]

basedir = Path.cwd()
now = time()

def detect_dirs(scandir=None)->list:
    """
    Detect which directories should be scanned, i.e. those whose name is
    listed in ``kana``.

    :param scandir: directory (``Path``) to search; defaults to ``basedir``
    :return: list of matching ``Path`` objects; entries that do not match
        are searched recursively when they are directories
    """
    if not scandir:
        scandir = Path(basedir)
    dirs = []
    for component in scandir.iterdir():
        # NOTE(review): the subject of this test and the statement under it
        # were missing from the copy at hand.  They are reconstructed here as
        # "entry name is in kana -> collect it"; confirm against the original.
        if component.name in kana:
            dirs.append(component)
        elif component.is_dir():
            dirs += detect_dirs(component)
    return dirs

def scan_dir(scandir)->list:
    # NOTE(review): this copy is damaged.  The two description lines below
    # have lost their docstring quotes, the `scan_base` assignment and both
    # `if`/`elif` tests have lost their left-hand operands, and the bodies
    # that presumably add entries to `files` are missing.  Restore from the
    # original before use.
    scanning the size of images
    it will dig a single-level of sub-directories
    # Anything that is not a directory yields an empty result.
    if not scandir.is_dir():
        return []
    files = []
    scan_base =[-1]

    # Entries are visited in sorted order; a sub-directory is descended into
    # exactly one level.
    for f in sorted(scandir.iterdir()):
        if f.is_dir():
            for sf in sorted(f.iterdir()):
                if and sf.is_file():
        elif and f.is_file():

    return files

def is_img(filename: str)->list:
    """
    Pair an image file with its size.

    Despite the name, no detection happens here: the file is handed to
    ``img_size`` as-is.

    :param filename: path of the image file
    :return: ``[filename, [width, height]]``
    """
    return [filename, img_size(filename)]

def img_size(filename: str)->list:
    """
    Return the size of an image.

    :param filename: path of the image file
    :return: ``[width, height]`` in pixels
    """
    # The context manager closes the file right away; many images are
    # measured in one run.
    with Image.open(filename, 'r') as img:
        return list(img.size)

def calc_xmeans(files: list)->list:
    """
    Cluster the images with x-means, using each image's size as its sample
    point.

    :param files: list of ``[filename, [width, height]]`` entries
    :return: list of clusters; each cluster is a list of
        ``[cluster number, index into files, files[index]]`` entries.
        Empty when ``files`` is empty.
    """
    # NOTE(review): the loop body that filled ``sample`` was missing from the
    # copy at hand; the size element of every entry is the only numeric data
    # available, so it is used here -- confirm against the original.
    sample = [f[1] for f in files]
    if not sample:
        # The initializer cannot choose centres from an empty sample.
        return []

    # Number of initial centres, chosen from the sample count.
    if len(sample) < 15:
        amount = 1
    elif len(sample) > 50:
        amount = 3
    else:
        amount = 2

    xm_c = kmeans_plusplus_initializer(sample, amount).initialize()
    xm_i = xmeans(sample, xm_c, ccore=True)
#    xm_i = xmeans(sample, xm_c, ccore=False)    # Use this line on Darwin, and pray
    xm_i.process()

    clusters = xm_i.get_clusters()

    clgr = []
    for j, c in enumerate(clusters):
        clgr.append([[j, i, files[i]] for i in c])

    return clgr

def export(data: list, directory)->None:
    # NOTE(review): this copy is damaged.  The description line below has
    # lost its docstring quotes, the objects indexed with [-1] / [-3] and the
    # receivers of the three `with ...'w'/'a') as ...` openings are missing,
    # and so is the body of the first `with` (presumably the HTML header).
    # Restore from the original before use.
    export to html and csv calculated classifications
    export_csv = directory / ([-1] + '.csv')
    export_html = Path(directory / '../../').resolve() / ([-3] + '.html')
    # NOTE(review): this only fires when the HTML file already exists; a
    # missing file would then be created by the append below without any
    # header -- confirm whether `not exists or ...` was intended.
    if export_html.exists() and export_html.stat().st_mtime < now:
        with'w') as exh:

    # CSV part: a header line, then one row per item of every cluster.
    with'w') as exc:
        exc.write(','.join(['cluster no','seq','mod cluster no','file name']) + '\n')
        for c in data:
            for i in c:
                # NOTE(review): the loop variable is `i`, yet the file-name
                # column reads `item`, which is not defined at this point;
                # the list opened by `[` is also never closed.  Presumably
                # `i[2][0]` followed by `]` was intended -- confirm.
                exc.write(','.join([str(i[0]), str(i[1]),'',\
                        str(item[2][0].relative_to(export_html.parent)) + '\n')

    # HTML part: heading, cluster count, then per cluster a numbered <h2> and
    # one <img> per item (path relative to the HTML file's directory, with
    # width/height taken from the item's size and followed by item[1]).
    with'a') as exh:
        exh.write('<h1>' + str([-1]) + '</h1>\n')
        exh.write('<p>The number of clusters: ' + str(len(data)) + '</p>\n')
        i = 0
        for c in data:
            i += 1
            exh.write('<h2>Cluster ' + str(i) + '</h2>\n')
            for item in c:
                exh.write('<span class="nobr"><img src="' +\
                        str(item[2][0].relative_to(export_html.parent)) +\
                        '" width="' + str(item[2][1][0]) + '" height="' +\
                        str(item[2][1][1]) + '">' + str(item[1]) + '</span> ')

# Driver: measure, cluster and export every detected directory, in sorted
# order.
dirs = detect_dirs()

for directory in sorted(dirs):
    export(
        data=calc_xmeans(files=scan_dir(directory)),
        directory=directory,
    )


#coding: utf-8

import sys
from pptx import Presentation

# Print the text of a PowerPoint file (path given as the first command-line
# argument) as a Markdown-like outline on standard output.
# NOTE(review): this copy is damaged -- the statement under
# `if not shape.has_text_frame:` (presumably `continue`), the branch that
# separates the title shape from the other shapes (presumably `else:`), and
# the paragraph text that followed the "*" bullet are missing.  Restore from
# the original before use.
if __name__ == '__main__':
  prs = Presentation(sys.argv[1])
  c = 1  # 1-based slide counter
  for slide in prs.slides:
    print("# Slide " + str(c))
    s = 1  # 1-based shape counter within the slide
    for shape in slide.shapes:
      if not shape.has_text_frame:
      # The first shape's first paragraph is printed as the slide heading.
      if s == 1:
        print("## " + shape.text_frame.paragraphs[0].text + "\n")
        # One bullet per paragraph, indented two spaces per outline level.
        for paragraph in shape.text_frame.paragraphs:
         print("  " * (paragraph.level) + "*"),
      s = s + 1
    # Speaker notes, when the slide has any, follow the slide's shapes.
    if slide.has_notes_slide:
      print(slide.notes_slide.notes_text_frame.text + "\n")
    c = c + 1
