All computer source code presented on this page, unless it includes attribution to another author, is provided by Ed Halley under the Artistic License. Use such code freely and without any expectation of support. I would like to know if you make anything cool with the code, or need questions answered.
python/
    bindings.py
    boards.py
    buzz.py
    caches.py
    cards.py
    constraints.py
    csql.py
    english.py
    getch.py
    getopts.py
    gizmos.py
    goals.py
    improv.py
    interpolations.py
    namespaces.py
    nihongo.py
    nodes.py
    octalplus.py
    patterns.py
    physics.py
    pids.py
    pieces.py
    quizzes.py
    recipes.py
    relays.py
    romaji.py
    ropen.py
    sheets.py
    stores.py
    strokes.py
    subscriptions.py
    svgbuild.py
    testing.py
    things.py
    timing.py
    ucsv.py
    useful.py
    uuid.py
    vectors.py
    weighted.py
java/
    CSVReader.java
    CSVWriter.java
    GlobFilenameFilter.java
    RegexFilenameFilter.java
    StringBufferOutputStream.java
    ThreadSet.java
    Throttle.java
    TracingThread.java
    Utf8ConsoleTest.java
    droid/
        ArrangeViewsTouchListener.java
        DownloadFileTask.java
perl/
    CVQM.pm
    Kana.pm
    Typo.pm
cxx/
    CCache.h
    equalish.cpp
Download Kana.pm
#!/usr/bin/perl

#----------------------------------------------------------------------------

package Kana;

use warnings;
use strict;
use utf8;

use Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(kana);
our @EXPORT_OK = qw();
our %EXPORT_TAGS = ( 'all' => [ @EXPORT, @EXPORT_OK ] );

#----------------------------------------------------------------------------

our $Encoding = 'utf8';

my %Kana;
my @Kana;
my %Irregulars;

sub _prepare;
{
    return if %Kana;

    # define hiragana and punctuation table first
    %Kana =
        ( '--' => "30FC", # hold dash
          '``' => "309B", # raised marks
          '^'  => "309C", # raised circle
          '~'  => "301C", # wave dash
          '['  => "300E", # left black corner bracket
          ']'  => "300F", # right black corner bracket
          '{'  => "3010", # left black lenticular bracket
          '}'  => "3011", # right black lenticular bracket
          '('  => "3016", # left white lenticular bracket
          ')'  => "3017", # right white lenticular bracket
          ','  => "3002", # ideographic comma
          '.'  => "3002", # ideographic full stop
          '-' => '',
          
          'chu' => 'chi(yu)', 'cha' => 'chi(ya)', 'cho' => 'chi(yo)',
          'shu' => 'shi(yu)', 'sha' => 'shi(ya)', 'sho' => 'shi(yo)',
          'kyu' => 'ki(yu)', 'kya' => 'ki(ya)', 'kyo' => 'ki(yo)', 
          'ryu' => 'ri(yu)', 'rya' => 'ri(ya)', 'ryo' => 'ri(yo)',

          'kka' => '(tsu)ka',
          'kki' => '(tsu)ki',
          'kku' => '(tsu)ku',
          'kke' => '(tsu)ke',
          'kko' => '(tsu)ko',

          'tta' => '(tsu)ta',
          'tti' => '(tsu)chi', 'tchi' => '(tsu)chi',
          'ttu' => '(tsu)tu',
          'tte' => '(tsu)te',
          'tto' => '(tsu)to',

          'ppa' => '(tsu)pa',
          'ppi' => '(tsu)pi',
          'ppu' => '(tsu)pu',
          'ppe' => '(tsu)pe',
          'ppo' => '(tsu)po',

          'ssha' => '(tsu)shi(ya)',
          'cchi' => '(tsu)chi',
          'syo' => 'shi(yo)',

          'ju' => 'ji(yu)', 'ja' => 'ji(ya)',
          'jo' => 'ji(yo)', 'je' => 'ji(e)',

          'a' => "3042", '(a)' => "3041",
          'i' => "3044", '(i)' => "3043",
          'u' => "3046", '(u)' => "3045",
          'e' => "3048", '(e)' => "3047",
          'o' => "304A", '(o)' => "3049",

          'ka' => "304B", 'ga' => "304C",
          'ki' => "304D", 'gi' => "304E",
          'ku' => "304F", 'gu' => "3050",
          'ke' => "3051", 'ge' => "3052",
          'ko' => "3053", 'go' => "3054",

          'sa' => "3055", 'za' => "3056",
          'shi' => "3057", 'ji' => "3058",
          'su' => "3059", 'zu' => "305A",
          'se' => "305B", 'ze' => "305C",
          'so' => "305D", 'zo' => "305E",

          'ta' => "305F", 'da' => "3060",
          'chi' => "3061", 'di' => "3062", 'ti' => "3061", 
          'tsu' => "3064", 'du' => "3065", '(tsu)' => "3063",
          'te' => "3066", 'de' => "3067",
          'to' => "3068", 'do' => "3069",

          'na' => "306A",
          'ni' => "306B",
          'nu' => "306C",
          'ne' => "306D",
          'no' => "306E",

          'ha' => "306F", 'ba' => "3070", 'pa' => "3071",
          'hi' => "3072", 'bi' => "3073", 'pi' => "3074",
          'hu' => "3075", 'bu' => "3076", 'pu' => "3077", 'fu' => "3075",
          'he' => "3078", 'be' => "3079", 'pe' => "307A",
          'ho' => "307B", 'bo' => "307C", 'po' => "307D",

          'ma' => "307E",
          'mi' => "307F",
          'mu' => "3080",
          'me' => "3081",
          'mo' => "3082",

          'ya' => "3084", '(ya)' => "3083",
          'yu' => "3086", '(yu)' => "3085",
          'yo' => "3088", '(yo)' => "3087",
        
          'ra' => "3089",
          'ri' => "308A",
          'ru' => "308B",
          're' => "308C",
          'ro' => "308D",

          'wa' => "308F",
          'wi' => "3090",
          'we' => "3091",
          'wo' => "3092",

          'n' => "3093", 'm' => "3093",
        );

    # repeat whole hiragana into katakana
    for (keys %Kana)
    {
        next if uc($_) eq $_;
        if ($Kana{$_} =~ /^3[0-9A-F]+$/)
        { $Kana{uc($_)} = sprintf("%X", hex($Kana{$_}) + 96); }
        else
        { $Kana{uc($_)} = uc($Kana{$_}); }
    }

    @Kana =
        map { $_->[0] }
        sort { $b->[1] <=> $a->[1] }
        map { [ $_, length($_) ] }
        keys %Kana;

    %Irregulars =
        ( 'konbanwa' => 'konbanha',
          'konnichiwa' => 'konnichiha',
          'dewa' => 'deha',
          'wa' => 'ha',
          'o' => 'wo',
        );
}

sub _encoded
{
    return '&#x' . $_[0] . ';' if $Encoding eq 'sgml';
    return chr( hex($_[0]) ) if $Encoding eq 'utf8';
    return '[?]';
}

#----------------------------------------------------------------------------

sub kana
{
    _prepare if not %Kana;

    my $string = shift;
    my $output = "";

    $string =~ s/\b\Q$_\E\b/$Irregulars{$_}/g
        for keys %Irregulars;

    my $max = length($Kana[0]);
    STRING: while ($string)
    {
        for (reverse 1..$max)
        {
            next if length($string) < $_;
            if (exists $Kana{substr($string, 0, $_)})
            {
                my $kana = $Kana{substr($string, 0, $_)};
                if ($kana =~ /^3[0-9A-F]+$/)
                { $output .= _encoded($kana); }
                else
                { $output .= kana($kana); }
                $string = substr($string, $_);
                next STRING;
            }
        }

        $output .= substr($string, 0, 1);
        $string = substr($string, 1);
    }

    $output =~ s/\s+//g;
    return $output;
}

sub test
{
    my $phrase = "konnichiwa. KO--HI-- wa tabemono dewa arimasen.";

    local $Kana::Encoding = 'sgml';
    my $encoded = Kana::kana($phrase);

    my $expected =
        "&#x3053;&#x3093;&#x306B;&#x3061;&#x306F;&#x3002;&#x30B3;&#x30FC;" .
        "&#x30D2;&#x30FC;&#x306F;&#x305F;&#x3079;&#x3082;&#x306E;&#x3067;" .
        "&#x306F;&#x3042;&#x308A;&#x307E;&#x305B;&#x3093;&#x3002;";

    die "Did not receive accurate results when trying sgml encoding"
        if $encoded ne $expected;
}

1;



Contact Ed Halley by email at ed@halley.cc.
Text, code, layout and artwork are Copyright © 1996-2013 Ed Halley.
Copying in whole or in part, with author attribution, is expressly allowed.
Any references to trademarks are illustrative and are controlled by their respective owners.
Make donations with PayPal - it's fast, free and secure!