構造化テキストのエスケープ

こんな風に処理すればいいのかな

中心となる部分は http://developer.cybozu.co.jp/kazuho/2010/09/twitter-xss-f73.html から拝借
入力テキストは中途半端にエスケープされた状態で届く( http://d.hatena.ne.jp/koseki2/20100414/twitterEscape )ため、最初に一度だけ decode_entities() をかけて素のテキストに戻す
分割して得られた $token と、そこから派生するデータにのみ encode_entities() をかける(コード中に直接書いた HTML 断片はエスケープしない)

もっと改良の余地はありそうだが…ぐぬぬ

use common::sense;
use HTML::Entities;

# Patterns for the three kinds of linkable tokens found in a tweet.
# Every pattern wraps its whole match in exactly one capture group, so that
# split() on the combined pattern returns the matched tokens in between the
# surrounding plain-text pieces.
our %re = (
    # Accept both http and https. FIXME: still takes every non-whitespace
    # character after the scheme, including trailing punctuation.
    uri   => qr{(https?://\S+)},
    reply => qr{(\@[0-9A-Za-z_]+)},
    hash  => qr{(\#[0-9A-Za-z_]+)},
);

# Combined splitter. Alternation order matters: a URL is tried first, so '@'
# or '#' characters inside a URL stay part of that URL.
our $regex = qr/$re{uri}|$re{reply}|$re{hash}/;

# Sample inputs: tweets carrying attribute-injection and <script> payloads,
# plus one with a twitpic URL, used to exercise make_link().
my @sample_tweets = (
    "See: http://x.xx/@\"style=\"color:pink\"onmouseover=alert(1)//",
    q{http://j.mp/dankogai @dankogai こうですか? #XSS わかりません <script>alert('XSS')</script>
http://twitter.com/search?q=a&r=b#@"onmouseover="alert(location.href)"/},
    q{http://j.mp/dankogai @dankogai こうですか? #XSS わかりません <script>alert('XSS')</script>
http://twitter.com/search?q=a&r=b#@"onmouseover="alert(location.href)"/ twitpic! http://twitpic.com/2r2umf},
);

# Print each rendered tweet wrapped in its own <div>, one piece per line.
foreach my $raw (@sample_tweets) {
    say join "\n", '<div>', make_link($raw), '</div>';
}

# make_link($text) -> $html
#
# Turn the text of one tweet into an HTML fragment in which URLs, @replies
# and #hashtags become links and everything else is entity-escaped.
#
# The input is entity-decoded once up front, then split with the file-level
# $regex. encode_entities() is applied only to the tokens and to data derived
# from them; the literal HTML written in this sub is emitted as-is.
#
# Parameters:
#   $text - tweet text; undef is treated as the empty string.
# Returns:
#   The HTML fragment as a string ('' for empty input).
sub make_link {
    my ($input) = @_;
    my $text = decode_entities($input // '');

    my $html = '';
    for my $token (split $regex, $text) {
        # $regex has one capture group per alternative, so for every match
        # split also returns undef for the groups that did not participate,
        # and an empty string where two matches are adjacent. Neither
        # contributes any output.
        next unless defined $token && length $token;

        # Classify with the very same patterns that were used to split,
        # anchored to the whole token. A plain-text piece that merely starts
        # with '#' or '@' must not be mistaken for a hashtag or a reply.
        if ($token =~ /\A$re{uri}\z/) {
            # Anchored at the start so that a URL which only contains a
            # twitpic address further in is not treated as twitpic.
            if ($token =~ m{\Ahttp://twitpic\.com/(\w+)}) {
                my $encoded = encode_entities($1);
                $html .= qq{
                    <a href="http://twitpic.com/$encoded">
                        <img src="http://twitpic.com/show/thumb/$encoded"></a>};

            } else {
                my $encoded = encode_entities($token);
                $html .= qq{<a href="$encoded">$encoded</a>};
            }

        } elsif ($token =~ /\A$re{hash}\z/) {
            # Tag name without the leading '#'.
            my $hash_ent = encode_entities(substr $token, 1);
            my $hash_tag = encode_entities($token);
            $html .= qq{<a href="http://search.twitter.com/search?q=%23$hash_ent">$hash_tag</a>};

        } elsif ($token =~ /\A$re{reply}\z/) {
            # Screen name without the leading '@'.
            my $user = encode_entities(substr $token, 1);
            $html .= qq{<a href="http://twitter.com/$user">\@$user</a>};

        } else {
            # Plain text between linkable tokens.
            $html .= encode_entities($token);

        }
    }

    return $html;
}

# Explicit end of the script. Perl stops compiling at the __END__ marker, so
# nothing placed after it is treated as code.
exit;

__END__