
A Python script that roughly classifies the Dataset of PMJT Character Shapes for you. It clusters the character images in each kana directory by their pixel dimensions with x-means and writes the result as a CSV per directory plus an HTML overview two levels up.

#!/usr/bin/python3
# coding: utf-8

#
# Usage: run from the directory just above the downloaded Dataset of PMJT Character Shapes
# (http://codh.rois.ac.jp/char-shape/)
#
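# Expected layout under the working directory (illustrative only; the book ID and
# the intermediate folder name are placeholders, not fixed by this script):
#   ./200003076/characters/U+3042/<image>.jpg
# A <U+XXXX>.csv is written inside each kana directory, and an HTML overview
# named after the directory two levels up is created or appended to.
#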

from pathlib import Path
from time import time

from PIL import Image
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

ext = '.jpg'

kana = [
    'U+3042', 'U+3044', 'U+3046', 'U+3048', 'U+304A', 'U+304B',
    'U+304C', 'U+304D', 'U+304E', 'U+304F', 'U+3050', 'U+3051',
    'U+3052', 'U+3053', 'U+3054', 'U+3055', 'U+3056', 'U+3057',
    'U+3058', 'U+3059', 'U+305A', 'U+305B', 'U+305C', 'U+305D',
    'U+305E', 'U+305F', 'U+3060', 'U+3061', 'U+3062', 'U+3064',
    'U+3065', 'U+3066', 'U+3067', 'U+3068', 'U+3069', 'U+306A',
    'U+306B', 'U+306C', 'U+306D', 'U+306E', 'U+306F', 'U+3070',
    'U+3072', 'U+3073', 'U+3075', 'U+3076', 'U+3078', 'U+3079',
    'U+307B', 'U+307C', 'U+307E', 'U+307F', 'U+3080', 'U+3081',
    'U+3082', 'U+3084', 'U+3086', 'U+3088', 'U+3089', 'U+308A',
    'U+308B', 'U+308C', 'U+308D', 'U+308F', 'U+3090', 'U+3091',
    'U+3092', 'U+3093', 'U+309D', 'U+309E', 'U+30B5', 'U+30C4',
    'U+30CB', 'U+30F6'
]

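# the working directory is the scan root; `now` marks the start of this run so
# that HTML files left over from an earlier run can be detected and rewritten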
basedir = Path.cwd()
now = time()

def detect_dirs(scandir=None) -> list:
    '''
    recursively collect the directories to scan,
    i.e. those whose names appear in the kana list
    '''
    if not scandir:
        scandir = basedir
    dirs = []
    for component in scandir.iterdir():
        if component.name in kana:
            dirs.append(component)
        elif component.is_dir():
            dirs += detect_dirs(component)
    return dirs

def scan_dir(scandir) -> list:
    '''
    collect [path, size] pairs for the images in a directory,
    descending at most one level of sub-directories
    '''
    if not scandir.is_dir():
        return []
    files = []

    for f in sorted(scandir.iterdir()):
        if f.is_dir():
            for sf in sorted(f.iterdir()):
                if sf.name.endswith(ext) and sf.is_file():
                    files.append(is_img(sf))
        elif f.name.endswith(ext) and f.is_file():
            files.append(is_img(f))

    return files

def is_img(filename: Path) -> list:
    '''
    pair an image path with its pixel size
    '''
    return [filename, img_size(filename)]

def img_size(filename: Path) -> list:
    '''
    return the [width, height] of an image
    '''
    with Image.open(filename) as img:
        return list(img.size)

def calc_xmeans(files: list) -> list:
    '''
    cluster the images by their pixel size with x-means
    '''
    sample = [f[1] for f in files]

    # rough heuristic: allow more initial centres when there are more samples
    if len(sample) < 15:
        amount = 1
    elif len(sample) > 50:
        amount = 3
    else:
        amount = 2

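    # seed x-means with k-means++ centres; x-means may then split them into more clusters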
    xm_c = kmeans_plusplus_initializer(sample, amount).initialize()
    xm_i = xmeans(sample, xm_c, ccore=True)
#    xm_i = xmeans(sample, xm_c, ccore=False)    # Use this line on Darwin, and pray

    xm_i.process()
    clusters = xm_i.get_clusters()

    clgr = []

    for j, c in enumerate(clusters):
        container = []
        for i in c:
            container.append([j, i, files[i]])
        clgr.append(container)

    return clgr

def export(data: list, directory) -> None:
    '''
    write the computed clusters to a CSV inside the kana directory
    and to an HTML overview two levels above it
    '''
    export_csv = directory / (directory.parts[-1] + '.csv')
    export_html = Path(directory / '../../').resolve() / (directory.parts[-3] + '.html')
    # (re)write the HTML header when the file is missing or left over from an earlier run
    if not export_html.exists() or export_html.stat().st_mtime < now:
        with export_html.open(mode='w') as exh:
            exh.write('<html><head><style>span.nobr{white-space:nowrap;}</style></head><body>')

    with export_csv.open(mode='w') as exc:
        exc.write(','.join(['cluster no', 'seq', 'mod cluster no', 'file name']) + '\n')
        for c in data:
            for i in c:
                exc.write(','.join([str(i[0]), str(i[1]), '',
                                    str(i[2][0].relative_to(export_html.parent))]) + '\n')

    with export_html.open(mode='a') as exh:
        exh.write('<h1>' + str(directory.parts[-1]) + '</h1>\n')
        exh.write('<p>The number of clusters: ' + str(len(data)) + '</p>\n')
        for i, c in enumerate(data, start=1):
            exh.write('<h2>Cluster ' + str(i) + '</h2>\n')
            exh.write('<p>')
            for item in c:
                exh.write('<span class="nobr"><img src="' +\
                        str(item[2][0].relative_to(export_html.parent)) +\
                        '" width="' + str(item[2][1][0]) + '" height="' +\
                        str(item[2][1][1]) + '">' + str(item[1]) + '</span> ')
            exh.write('</p>\n')

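# main flow: find every kana directory, cluster its images by size, export the results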
dirs = detect_dirs()

for directory in sorted(dirs):
    dir_scan = scan_dir(directory)
    if not dir_scan:
        continue    # no images found, nothing to cluster
    xm_clusters = calc_xmeans(files=dir_scan)
    export(data=xm_clusters, directory=directory)
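
The empty 'mod cluster no' column in the CSV is there for manual corrections against the HTML overview (that is at least how I read it). As a minimal sketch, assuming a hand-edited CSV at an example path, the file can be read back and regrouped with the standard library:

import csv
from collections import defaultdict
from pathlib import Path

# example path only: one of the CSVs the script above writes inside a kana directory
corrected = Path('200003076/characters/U+3042/U+3042.csv')

groups = defaultdict(list)
with corrected.open(newline='') as f:
    for row in csv.DictReader(f):
        # fall back to the automatic cluster when no manual label was entered
        label = row['mod cluster no'] or row['cluster no']
        groups[label].append(row['file name'])

for label, names in sorted(groups.items()):
    print(label, len(names), 'images')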