kzhr's diary

ad ponendum

「日本古典籍字形データセットをかんたんに分類してくれるPythonスクリプト」の結果を(手で)修正した量の統計を出すスクリプト

#!/usr/bin/perl

use 5.012;

use strict;
use warnings;
use utf8;

use Cwd;
use FindBin;

use Data::Dumper;

require "$FindBin::Bin/clgr.pl";

my $wd = Cwd::getcwd();
my @directories = clgr::scan_dir($wd);

say join ",", 'Mat', 'U+ID', 'Total', 'Err', 'Err%', 'Orig No C', 'Mod No C', 'Diff';

for my $dir (@directories) {
  my @path = clgr::gen_path($dir);
  my $csv = clgr::read_csv($dir);
  my $data = clgr::cluster($csv);

  my %stat = ();
  my $i = 0;
  my %c_no = ();

  for my $c (@$data) {
    $c_no{mod}{$i}++;
    for my $item (@$c){
      $stat{$i}{$item->[1]}++;
      $c_no{org}{$item->[1]}++;
    }
    $i++;
  }
  my @stat_keys = sort keys %stat;
  my $error_no = 0;
  my $total = 0;

  for my $stat_key (@stat_keys) {
    my $cluster_max = 0;
    my $cluster_total = 0;
    my @original_stat_keys = sort keys %{$stat{$stat_key}};
    for my $original_stat_key (@original_stat_keys) {
      $cluster_total += $stat{$stat_key}{$original_stat_key};
      $cluster_max = $stat{$stat_key}{$original_stat_key} > $cluster_max ? $stat{$stat_key}{$original_stat_key} : $cluster_max;
    }
    $total += $cluster_total;
    $error_no += $cluster_total - $cluster_max;
  }

  my $orig_no_c = scalar keys %{$c_no{org}};
  my $mod_no_c = scalar keys %{$c_no{mod}};
  # 'Mat', 'U+ID', 'Total', 'Err', 'Err%', 'Orig No C', 'Mod No C', 'Diff b/w Orig & Mod No C'
  say join ",", $path[-3], $path[-1], $total, $error_no, sprintf("%.2f", $error_no / $total * 100), $orig_no_c, $mod_no_c, abs($orig_no_c - $mod_no_c);
#  warn Dumper($data, \%stat);
#  last;
}