#!/usr/bin/perl -w
#
# This is a test script used to explore encodings.
# It generates a series of pages that can be sent to the browser
# to test the transformations used by the TinyMCEPlugin. Specifically:
# 1. Formulate page in a character encoding. The page contains a textarea
#    that contains text that makes sense in the selected encoding.
# 2. Send that page to the client
#    Visually verify that the rendered page reflects the encoded text.
# 3. Click the 'Run Test 1' button. Client compiles an XHR based on the
#    content of the textarea and POSTs it back to the server.
# 4. Server (this script) receives content and decodes it to a state where
#    TWiki could use it. This is done in the 'text2html' section.
# 5. A suitably encoded response is sent to the client, which sets a
#    DIV.innerHTML with the response (this is how TMCE works)
# 6. Client JS now compiles a second request, using the content of the DIV
#    (which is pure HTML). This is sent to the 'html2text' section. The
#    response is shown in the final textarea.
# This is a true and accurate reflection of the process used for WYSIWYG.
#
use strict;
use Encode;
use CGI;
use CGI::Carp qw(fatalsToBrowser);
use File::Spec;
use HTML::Entities;

#use locale;

# HTML template for the test page. Placeholder tokens (%CHARSET%,
# %CHARSETS%, %TEXT%, %SAVEDTEXT%, %SAVEFILENAME%, ...) are substituted
# before the page is printed. Single-quoted heredoc: no interpolation.
my $htmlPageBody = <<'HERE';

Testing encoding transformations for TWiki

This is a test script used to debug encodings. It is a simulation of the character encoding transformations used by TinyMCEPlugin and WyswiygPlugin, and is intended to help the author of those plugins understand the problems. Others may also find it useful to help understand those plugins!

To use the script you must understand the following terms:

You are also advised to read the encodings help page on TWiki.org
To run the tests, first select an encoding. This is what is done when you set up TWiki to use a specific {Site}{CharSet}. The currently selected encoding used to display this page is %CHARSET%. The perl module Encode::encodings says the following encodings are available for selection: %CHARSETS%

New encoding

You should also select a range of characters that will be used to generate a "test topic". The range is specified using decimal code points. You can use this site to find the decimal code points for the characters you are interested in. Enter the code points of the lowest and the highest characters in the interesting range. Avoid ranges that include code points in the range 32 to 126, as some of these characters have a special meaning in HTML.
Code point range (decimal) ...


Test status

The following textarea is initially populated by the script by generating a string of characters based on the range you specified above, and then converting that string to the selected encoding. That means the content of the textarea is exactly what you will see in a standard TWiki edit textarea, given those characters in the topic. If you see any characters shown as \x{HHHH} it means that the character range you entered could not be fully encoded using the selected charset. This will not affect the later tests, and does not indicate an error (PEBKAC).

If you want, you can just type into the textarea and it will work on whatever you type.

Test 1 - Simulate TML to HTML REST handler

When you run test 1, the content of the textarea just above is URL encoded and sent to the server in an XHR. The server does whatever processing is specified for 'text2html' and sends the result back to the client. The client then adds it to the following DIV between the following hr's by setting .innerHTML Visually verify that the resulting characters are the same as those in the first textarea.

Run Test 1 Warning - if you are using IE6, it is incredibly slow. be patient!



Test 2 - Simulate HTML to TML REST handler

In Test 2, the client sends the .innerHTML of the DIV built in Test 1 back to the server in a html2text request. The server does whatever processing is specified for 'html2text' and sends the result back to the client. The client adds this new text to the following textarea below.

Run Test 2


If everything is working for this charset you should see the same text in the first textarea, the DIV and the final textarea.

Test 3 - Form submission

The final test simulates form submission in TMCE. It submits a form containing the textarea that resulted from Test 2 back to the server. The result of that action is a refresh of this page, with the space between the following hr's filled in with the result of the submit. Visually check that this result is the same as the first textarea. Note that the textareas and div in tests 1 and 2 will be cleared.

This result is also written to a temporary file on the server, the name of which will be reported after the test. This file should be manually checked on the server to ensure it is correctly encoded using the selected encoding.


%SAVEDTEXT%
Saved file: %SAVEFILENAME%

HERE

# CGI request object. (Was `new CGI` — indirect object syntax, which perl
# can parse ambiguously; Class->new() is the unambiguous form.)
my $q = CGI->new();

# Which phase of the round trip to run; a plain GET renders the test page.
my $action = $q->param('action') || 'update';

# Simulate TWiki's {Site}{CharSet} configuration from the form selection.
$TWiki::cfg{Site}{CharSet} = $q->param('charset');
$TWiki::cfg{Site}{CharSet} = 'iso-8859-1'
  unless defined $TWiki::cfg{Site}{CharSet};

# CGI.pm assumes iso-8859-1 unless told otherwise; tell it about any other
# charset so its own header and parameter handling agree with the page.
if ($TWiki::cfg{Site}{CharSet} !~ /^iso-?8859-?1$/i) {
    CGI::charset($TWiki::cfg{Site}{CharSet});
}

# Code-point range (decimal, inclusive) used to build the test string.
# These come straight from the browser, so insist on plain digits before
# feeding them to chr() and the loop below; fall back to printable ASCII.
my $firstchar = $q->param('firstchar');
$firstchar = 32 unless defined $firstchar && $firstchar =~ /^\d+$/;
my $lastchar = $q->param('lastchar');
$lastchar = 126 unless defined $lastchar && $lastchar =~ /^\d+$/;

# Create a unicode string. This is stored by perl using wide characters.
my @test;
for (my $i = $firstchar; $i <= $lastchar; $i++) {
    push(@test, $i . ":" . chr($i));
}
my $text = join(' ', @test) . ".";

# Convert the unicode string to the selected encoding. We use FB_PERLQQ to
# defuse (render as \x{HHHH}) any character that cannot be represented in
# the current charset.
my $encoded_text =
  Encode::encode($TWiki::cfg{Site}{CharSet}, $text, Encode::FB_PERLQQ);

# Decode the string again for use in the tests. The string should now be
# what ends up in a topic after a TWiki edit using that charset.
$text = Encode::decode($TWiki::cfg{Site}{CharSet}, $encoded_text,
    Encode::FB_PERLQQ);

# Mapping high-bit characters from unicode back to iso-8859-1
# (a.k.a Windows 1252 a.k.a "ANSI") - http://www.alanwood.net/demos/ansi.html
my %unicode2ANSI = (
    chr(8364) => chr(128),    # EURO SIGN
    chr(8218) => chr(130),    # SINGLE LOW-9 QUOTATION MARK
    chr(402)  => chr(131),    # LATIN SMALL LETTER F WITH HOOK
    chr(8222) => chr(132),    # DOUBLE LOW-9 QUOTATION MARK
    chr(8230) => chr(133),    # HORIZONTAL ELLIPSIS
    chr(8224) => chr(134),    # DAGGER
    chr(8225) => chr(135),    # DOUBLE DAGGER
    chr(710)  => chr(136),    # MODIFIER LETTER CIRCUMFLEX ACCENT
    chr(8240) => chr(137),    # PER MILLE SIGN
    chr(352)  => chr(138),    # LATIN CAPITAL LETTER S WITH CARON
    chr(8249) => chr(139),    # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    chr(338)  => chr(140),    # LATIN CAPITAL LIGATURE OE
    chr(381)  => chr(142),    # LATIN CAPITAL LETTER Z WITH CARON
    chr(8216) => chr(145),    # LEFT SINGLE QUOTATION MARK
    chr(8217) => chr(146),    # RIGHT SINGLE QUOTATION MARK
    chr(8220) => chr(147),    # LEFT DOUBLE QUOTATION MARK
    chr(8221) => chr(148),    # RIGHT DOUBLE QUOTATION MARK
    chr(8226) => chr(149),    # BULLET
    chr(8211) => chr(150),    # EN DASH
    chr(8212) => chr(151),    # EM DASH
    chr(732)  => chr(152),    # SMALL TILDE
    chr(8482) => chr(153),    # TRADE MARK SIGN
    chr(353)  => chr(154),    # LATIN SMALL LETTER S WITH CARON
    chr(8250) => chr(155),    # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    chr(339)  => chr(156),    # LATIN SMALL LIGATURE OE
    chr(382)  => chr(158),    # LATIN SMALL LETTER Z WITH CARON
    chr(376)  => chr(159),    # LATIN CAPITAL LETTER Y WITH DIAERESIS
);

# Reverse mapping. The original built this with
#   map { $unicode2ANSI{$_} => $_ } keys %unicode2ANSI
# whose leading `{` relies on perl guessing BLOCK rather than an anonymous
# hash constructor (see perlfunc/map). The mapping is one-to-one, so
# reversing the flattened key/value list is unambiguous and equivalent.
my %ANSI2Unicode = reverse %unicode2ANSI;

# Character sets for the s///-based remapping below. All of these
# characters are outside the ASCII range, so none of them are regex
# metacharacters and they can be interpolated into a character class
# without quoting.
my $unicode2ANSIChars = join('', keys %unicode2ANSI);
my $ANSI2UnicodeChars = join('', keys %ANSI2Unicode);

=pod

---++ RESTParameter2SiteCharSet($text)

Text that is taken from a web page and added to the parameters of an XHR
by JavaScript is UTF-8 encoded. This is because UTF-8 is the default
encoding for XML, which XHR was designed to transport. This function is
used to decode such parameters to the currently selected TWiki site
character set.

Note that this transform is not as simple as an Encode::from_to, as a
number of unicode code points must be remapped for certain encodings.

=cut

sub RESTParameter2SiteCharSet {
    my ($text) = @_;

    # XHR parameters arrive as UTF-8 octets; decode them to perl's internal
    # wide-character form. FB_PERLQQ renders malformed sequences as
    # \x{HHHH} instead of dying.
    $text = Encode::decode_utf8($text, Encode::FB_PERLQQ);

    if (Encode::resolve_alias($TWiki::cfg{Site}{CharSet}) eq 'iso-8859-1') {

        # Map unicode back to iso-8859 high-bit chars
        $text =~ s/([$unicode2ANSIChars])/$unicode2ANSI{$1}/ge;
    }

    # Re-encode into the site charset; this is the byte string TWiki
    # itself would operate on.
    $text = Encode::encode($TWiki::cfg{Site}{CharSet}, $text,
        Encode::FB_PERLQQ);

    return $text;
}

=pod

---++ siteCharSet2RESTResult($text)

Text that is taken from a web page and added to the parameters of an XHR
by JavaScript is UTF-8 encoded. This is because UTF-8 is the default
encoding for XML, which XHR was designed to transport. For usefulness in
Javascript the response to an XHR should also be UTF-8 encoded.

Note that this transform is not as simple as an Encode::from_to, as a
number of unicode code points must be remapped for certain encodings.

=cut

sub siteCharSet2RESTResult {
    my ($text) = @_;

    # Lift the site-charset octets back to perl's internal form.
    $text = Encode::decode($TWiki::cfg{Site}{CharSet}, $text,
        Encode::FB_PERLQQ);

    if (Encode::resolve_alias($TWiki::cfg{Site}{CharSet}) eq 'iso-8859-1') {

        # Map iso-8859 high-bit chars up to their unicode equivalents
        $text =~ s/([$ANSI2UnicodeChars])/$ANSI2Unicode{$1}/ge;
    }

    # XHR responses must be UTF-8 for the client-side JavaScript.
    $text = Encode::encode_utf8($text);

    return $text;
}

# Send $t as a text/plain UTF-8 HTTP response with a byte-accurate
# Content-length header. Shared by the two REST simulations below.
sub _sendUTF8PlainText {
    my ($t) = @_;
    print "Content-type: text/plain;charset=UTF-8\r\n";
    my $len;
    {
        # Content-length must count octets, not characters.
        use bytes;
        $len = length($t);
    };
    print "Content-length: ", $len, "\r\n";
    print "\r\n";
    print $t;
    return;
}

# The 'text2html' (TML -> HTML) and 'html2text' (HTML -> TML) REST handler
# simulations: decode the XHR parameter to the site charset (the string
# TWiki requires), then re-encode the result for the XHR response. The two
# branches were duplicated verbatim in the original script, so they are
# merged here; their behavior is unchanged.
if ($action eq 'text2html' || $action eq 'html2text') {

    # Force scalar context: CGI::param in list context can return a list.
    my $t = RESTParameter2SiteCharSet(scalar $q->param('text'));

    # This is the string that TWiki requires.
    $t = siteCharSet2RESTResult($t);
    _sendUTF8PlainText($t);
    exit 0;
}

# Otherwise render the full test page, filling in the template tokens.
my $page = $htmlPageBody;
$page =~ s/%CHARSETS%/join(', ', Encode->encodings(":all"))/ge;
$page =~ s/%FIRSTCHAR%/$firstchar/g;
$page =~ s/%LASTCHAR%/$lastchar/g;
$page =~ s/%CHARSET%/$TWiki::cfg{Site}{CharSet}/gs;

# The text is encoded, so should be exactly what is stored on disk
# in TWiki.
$page =~ s/%TEXT%/$encoded_text/gs;

if ($action eq 'save') {
    my $data = $q->param('text');

    # This is a form submission, so the text should already be
    # encoded in the site charset.
    my $dir      = File::Spec->tmpdir();
    my $savefile = File::Spec->catfile($dir, 'encodings_test');

    # Three-arg open with a lexical handle (the original used the unsafe
    # two-arg form with the bareword handle F).
    if (open(my $fh, '>', $savefile)) {
        print $fh $data;
        close($fh);
        $page =~ s/%SAVEFILENAME%/$savefile/;
    }
    else {
        $page =~ s/%SAVEFILENAME%/Failed: $!/;
    }
    $page =~ s/%SAVEDTEXT%/$data/;
}

print "Content-type: text/html\r\n\r\n";
print $page;

1;