path: root/misc/count.py



#!/usr/bin/env python
# Distributed under the "here, have it" license
# Written by Greg Hellings, all rights reserved

# Counts all the characters in a file, assumes UTF-8 encoding, and
# reports the frequency of each character as well as the Unicode
# character name for that code point. Accepts an arbitrary number of
# files on the command line and reports the aggregate across all of
# them. Can also accept input from stdin. If you want to mix stdin
# with files, pass the filename '-' on the command line.

import fileinput
from unicodedata import name
from operator import itemgetter

def sort_dict(adic):
	"""Return the values of ``adic`` ordered by their keys.

	adic -- a mapping whose (key, value) pairs can be sorted.

	Returns a new list holding the mapping's values, arranged in
	ascending order of the corresponding keys. An empty mapping yields
	an empty list.
	"""
	# sorted() accepts any iterable, so this works whether items() hands
	# back a list (Python 2) or a view object (Python 3); calling .sort()
	# on the result of items() only worked for the former.
	return [value for _key, value in sorted(adic.items())]

# Tally how often each character occurs across every input named on the
# command line (stdin when no names are given, or when '-' is passed).
chars = dict()
# Binary mode makes every line arrive as raw bytes on both Python 2 and
# Python 3, so the explicit UTF-8 decode below is always valid and does
# not depend on the locale's default text encoding.
for line in fileinput.input(mode='rb'):
	for c in line.decode('utf-8'):
		chars[c] = chars.get(c, 0) + 1

# Order the (character, count) pairs from most to least frequent.
items = sorted(chars.items(), key=itemgetter(1), reverse=True)
print('Code point\tCharacter\tName\t\tCount')
for key, val in items:
	try:
		n = name(key)
	except ValueError:
		# unicodedata.name() raises ValueError for code points that have
		# no assigned name (control characters, for instance).
		n = 'not found'
	print('%06X\t\t%s\t%24s %s' % (ord(key), key, n, val))