blob: 648c933293ae122701a7d3fd61ad9f226d6346af (
plain) (
tree)
|
|
#!/usr/bin/env python
# Distributed under the "here, have it" license
# Written by Greg Hellings, all rights reserved
# Counts all the characters in a file, assumes UTF-8 encoding, and
# reports the frequency of each character as well as the Unicode
# character name for that code point. Can accept an arbitrary number
# of files on the argument line and will report the aggregate across
# each file. Can also accept input from stdin. If you want to mix
# stdin with files pass the filename '-' on the argument line.
import fileinput
from collections import Counter
from operator import itemgetter
from unicodedata import name
def sort_dict(adic):
    """Return the values of ``adic`` ordered by their keys.

    The (key, value) pairs are sorted in ascending order and only the
    values are returned, in that order.  An empty mapping gives ``[]``.

    ``sorted()`` is used instead of ``adic.items()`` followed by an in-place
    ``.sort()`` because ``dict.items()`` returns a view without a ``sort``
    method on Python 3; ``sorted()`` behaves the same on Python 2 and 3.
    """
    return [value for _key, value in sorted(adic.items())]
def count_chars(lines):
    """Count how often each character occurs in ``lines``.

    ``lines`` is any iterable of lines.  Byte strings are decoded as UTF-8
    (strictly: malformed input raises ``UnicodeDecodeError``); lines that
    are already text are counted as they are.

    Returns a ``collections.Counter`` mapping each character to its count.
    """
    counts = Counter()
    for line in lines:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        counts.update(line)
    return counts


def format_report(counts):
    """Build the report rows for a character -> count mapping.

    The first row is the column header.  It is followed by one row per
    character, most frequent first, holding the code point in hex, the
    character itself, its Unicode name and its count.  Characters that
    have no Unicode name are reported as 'not found'.

    Returns the rows as a list of strings without trailing newlines.
    """
    rows = ['Code point\tCharacter\tName\t\tCount']
    by_frequency = sorted(counts.items(), key=itemgetter(1), reverse=True)
    for char, count in by_frequency:
        # Passing a default to name() replaces the former bare ``except:``,
        # which also swallowed unrelated errors such as KeyboardInterrupt.
        char_name = name(char, 'not found')
        rows.append('%06X\t\t%s\t%24s %s'
                    % (ord(char), char, char_name, count))
    return rows


def main():
    """Count the characters of the files named on the command line.

    Input comes from ``fileinput.input()``, i.e. the files given as
    arguments, or stdin when there are none or when '-' is given.
    """
    # Binary mode yields raw bytes on both Python 2 and Python 3, so the
    # data is always decoded as UTF-8 rather than with the locale encoding.
    counts = count_chars(fileinput.input(mode='rb'))
    for row in format_report(counts):
        # print(<single expression>) is valid on Python 2 and Python 3.
        print(row)


if __name__ == '__main__':
    main()
|