from pathlib import Path
from time import time
from PIL import Image
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
# File-name suffix that marks a file as an image to be measured.
ext = '.jpg'

# Names of the directories to pick up, written as Unicode code point
# labels (presumably one kana character per directory -- confirm with
# the dataset layout).
kana = [
    'U+3042', 'U+3044', 'U+3046', 'U+3048', 'U+304A', 'U+304B',
    'U+304C', 'U+304D', 'U+304E', 'U+304F', 'U+3050', 'U+3051',
    'U+3052', 'U+3053', 'U+3054', 'U+3055', 'U+3056', 'U+3057',
    'U+3058', 'U+3059', 'U+305A', 'U+305B', 'U+305C', 'U+305D',
    'U+305E', 'U+305F', 'U+3060', 'U+3061', 'U+3062', 'U+3064',
    'U+3065', 'U+3066', 'U+3067', 'U+3068', 'U+3069', 'U+306A',
    'U+306B', 'U+306C', 'U+306D', 'U+306E', 'U+306F', 'U+3070',
    'U+3072', 'U+3073', 'U+3075', 'U+3076', 'U+3078', 'U+3079',
    'U+307B', 'U+307C', 'U+307E', 'U+307F', 'U+3080', 'U+3081',
    'U+3082', 'U+3084', 'U+3086', 'U+3088', 'U+3089', 'U+308A',
    'U+308B', 'U+308C', 'U+308D', 'U+308F', 'U+3090', 'U+3091',
    'U+3092', 'U+3093', 'U+309D', 'U+309E', 'U+30B5', 'U+30C4',
    'U+30CB', 'U+30F6',
]

# Root of the directory walk: the working directory at start-up.
basedir = Path.cwd()

# Start-up timestamp (seconds since the epoch); export() compares it with
# the modification time of an existing HTML report.
now = time()
def detect_dirs(scandir=None)->list:
    '''
    Collect the directories that should be scanned.

    Walks ``scandir`` recursively (the module-level ``basedir`` when no
    directory is given) and returns every directory whose name appears in
    the module-level ``kana`` list. A matching directory is returned as
    is and not descended into; every other directory is searched
    recursively.

    Regular files are ignored even when their name is listed in ``kana``,
    so that only real directories are handed to the scanner.

    :param scandir: ``pathlib.Path`` of the directory to search, or None.
    :return: list of ``pathlib.Path`` objects, in directory-listing order.
    '''
    if not scandir:
        scandir = Path(basedir)
    dirs = []
    for component in scandir.iterdir():
        if not component.is_dir():
            # A file can never be a scan target, whatever its name is.
            continue
        if component.name in kana:
            dirs.append(component)
        else:
            dirs.extend(detect_dirs(component))
    return dirs
def scan_dir(scandir)->list:
    '''
    List the images below ``scandir`` together with their sizes.

    Looks at the files directly inside ``scandir`` and at the files one
    level down, inside its immediate sub-directories. Entries are visited
    in sorted order. A file counts as an image when its name ends with
    the module-level ``ext`` suffix.

    :param scandir: ``pathlib.Path`` of the directory to scan.
    :return: list of ``is_img()`` results, one per image found; an empty
        list when ``scandir`` is not a directory.
    '''
    if not scandir.is_dir():
        return []
    files = []
    for entry in sorted(scandir.iterdir()):
        # A sub-directory contributes its direct children (one level only);
        # anything else is considered on its own.
        candidates = sorted(entry.iterdir()) if entry.is_dir() else [entry]
        for candidate in candidates:
            if candidate.name.endswith(ext) and candidate.is_file():
                files.append(is_img(candidate))
    return files
def is_img(filename: str)->list:
    '''
    Pair an image path with its size.

    Despite its name this helper performs no detection or validation; it
    simply asks ``img_size()`` for the dimensions of ``filename``.

    :param filename: path of the image file.
    :return: two-element list ``[filename, size]`` where ``size`` is the
        value returned by ``img_size(filename)``.
    '''
    size = img_size(filename)
    return [filename, size]
def img_size(filename: str)->list:
    '''
    Return the size of an image.

    :param filename: path of the image file to open.
    :return: the image's ``size`` attribute converted to a list
        (Pillow documents it as ``(width, height)`` in pixels).
    '''
    # Use the image as a context manager so the underlying file is closed
    # right away; otherwise one handle per scanned image stays open.
    with Image.open(filename, 'r') as img:
        return list(img.size)
def calc_xmeans(files: list)->list:
    '''
    Cluster images by their size with X-means.

    The second element of every entry in ``files`` is used as the sample
    point. The number of initial centers depends on the sample count:
    1 for fewer than 15 samples, 3 for more than 50, otherwise 2.

    :param files: list of ``[path, size]`` entries as built by ``is_img()``.
    :return: list of clusters; each cluster is a list of
        ``[cluster_no, index, files[index]]`` entries, where ``cluster_no``
        counts from 0 and ``index`` is the position of the entry in
        ``files``. An empty list is returned for empty input.
    '''
    if not files:
        # There is nothing to cluster, and no initial center can be
        # chosen from an empty sample.
        return []

    sample = [entry[1] for entry in files]
    count = len(sample)
    if count < 15:
        amount = 1
    elif count > 50:
        amount = 3
    else:
        amount = 2

    xm_c = kmeans_plusplus_initializer(sample, amount).initialize()
    xm_i = xmeans(sample, xm_c, ccore=True)
    xm_i.process()

    return [
        [[cluster_no, index, files[index]] for index in members]
        for cluster_no, members in enumerate(xm_i.get_clusters())
    ]
def export(data: list, directory)->None:
    '''
    Write the clustering result of one directory to CSV and HTML.

    The CSV file is created inside ``directory`` and named after it. It
    lists one row per image: cluster number, index, an empty
    "mod cluster no" column and the image path.

    The HTML report lives two levels above ``directory`` and is named
    after that grandparent directory, so the results of several
    directories are appended to the same report. The report is started
    afresh (header written) when it does not exist yet or when it is
    older than the module-level ``now`` timestamp; otherwise the new
    section is appended to it.

    Image paths in both files are relative to the report's directory.

    :param data: clusters as returned by ``calc_xmeans()``: a list of
        lists of ``[cluster_no, index, [path, [width, height]]]``.
    :param directory: ``pathlib.Path`` of the scanned directory; it needs
        at least three path components.
    '''
    export_csv = directory / (directory.parts[-1] + '.csv')
    export_html = Path(directory / '../../').resolve() / (directory.parts[-3] + '.html')
    html_root = export_html.parent

    # A missing report needs its header just as much as a stale one does;
    # without the "not exists" case a first run never writes the header.
    if not export_html.exists() or export_html.stat().st_mtime < now:
        with export_html.open(mode='w') as exh:
            exh.write('<html><head><style>span.nobr{white-space:nowrap;}</style></head><body>')

    with export_csv.open(mode='w') as exc:
        exc.write(','.join(['cluster no','seq','mod cluster no','file name']) + '\n')
        for cluster in data:
            for item in cluster:
                row = [
                    str(item[0]),
                    str(item[1]),
                    '',  # "mod cluster no" is left blank
                    str(item[2][0].relative_to(html_root)),
                ]
                exc.write(','.join(row) + '\n')

    with export_html.open(mode='a') as exh:
        exh.write('<h1>' + str(directory.parts[-1]) + '</h1>\n')
        exh.write('<p>The number of clusters: ' + str(len(data)) + '</p>\n')
        # Clusters are numbered from 1 in the report.
        for number, cluster in enumerate(data, start=1):
            exh.write('<h2>Cluster ' + str(number) + '</h2>\n')
            exh.write('<p>')
            for item in cluster:
                exh.write('<span class="nobr"><img src="' +
                          str(item[2][0].relative_to(html_root)) +
                          '" width="' + str(item[2][1][0]) + '" height="' +
                          str(item[2][1][1]) + '">' + str(item[1]) + '</span> ')
            exh.write('</p>\n')
# Driver: find the kana directories, measure and cluster the images of
# each one, and write the CSV/HTML reports.
dirs = detect_dirs()
for directory in sorted(dirs):
    dir_scan = scan_dir(directory)
    if not dir_scan:
        # No images were found here, so there is nothing to cluster or
        # report. NOTE(review): the pyclustering center initializer is
        # expected to reject an empty sample -- confirm.
        continue
    xm_clusters = calc_xmeans(files=dir_scan)
    export(data=xm_clusters, directory=directory)