RE: [xsl] output encoding problem (can't get UTF-8)

Subject: RE: [xsl] output encoding problem (can't get UTF-8)
From: "Gabriel K" <gabriel.klappenbach@xxxxxxxxxxxx>
Date: Sun, 7 Aug 2005 10:56:16 +0200
Yeah, I have kind of been forced to learn more about this messed up encoding
stuff, so I know the stuff you said below. :P
However, others and myself can transform my XML correctly when done not via
the ASP page - please see my previous mail.

Thanks for your mail, it was interesting reading. :)
And sorry about the "SV:", I'm using outlook and it seems you can't change
that.

/Gabriel


> -----Ursprungligt meddelande-----
> Från: Soren Kuula [mailto:dongfang@xxxxxxxxxxx]
> Skickat: den 7 augusti 2005 00:19
> Till: xsl-list@xxxxxxxxxxxxxxxxxxxxxx
> Ämne: Re: SV: [xsl] output encoding problem (can't get UTF-8)
>
> Gabriel K wrote:
>
> >That's what I'm thinking too, but the output has screwed up Å Ä Ö in all
> >places, not just links. Very weird. I can't think of why this would
> happen.
> >:/
> >
> >
> Hi,
>
> Many people are not really aware what this encoding thing is about...
> are you sure that the encoding you have DECLARED is the same as the one
> you have APPLIED throughout? The encoding you declare in xml is that in
> <?xml encoding="foobar"?>. The encoding you actually use for the
> document is the one that your editor, or the tool that generated it,
> saved it in. They must be the same.......
>
> With utf-8, a character expanding into several is caused by:
>
> - Something writes the document, in utf-8.
> - Something reads it, thinking/guessing/defaulting to/being fooled into
> thinking that it's Latin-1 or some similar a-character-is-a-byte encoding.
> - The above is repeated zero or more times.
>
> (the opposite phenomenon, characters disappearing, is when programs think
> they're reading utf-8 but the data is not utf-8...)
>
> I cooked up some Java once to demonstrate it: Utf-8 is digested as if it
> were one byte a char (as Latin-1). This is repeated, a big mess is
> made, and the opposite error is then applied twice to clean it up :)
>
>
> If you know emacs, you can inspect an xml document's encoding:
>
> C-x <return> C <now enter the name of the encoding that the xml header
> claims is used> <return> <C-x><C-f><enter filename>. That is: force
> emacs to use a specific encoding for the following command, which is then
> 'read the file'. If now æ ø å ä ö are bad, then the encoding in the xml
> header is lying.....
>
> One more thing: Can you use a mail client that writes Re: in the header,
> not SV: ? We get confused... (and mail ends up in the wrong folder)
>
> Soren
>
/**
 * Hand-rolled UTF-8 encoder/decoder operating on {@code int} arrays, used to
 * demonstrate what happens when UTF-8 bytes are mistaken for a
 * one-byte-per-character encoding such as Latin-1 and then encoded again.
 *
 * <p>Every array element holds either one code point or one byte value
 * (0..255). The original (pre-RFC 3629) UTF-8 scheme with up to six bytes per
 * character is implemented, so values up to {@code 0x7FFFFFFF} can be encoded.
 *
 * <p>The decoder is deliberately lenient: it performs no validation of
 * continuation bytes, and a head byte below {@code 0xC0} is treated as a
 * single-byte character masked to its low 7 bits.
 *
 * @author dongfang
 * @see <a href="http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8">UTF-8 and
 *      Unicode FAQ, first table on the page</a>
 */
public class Encoder {

  /** Returns the number of continuation bytes announced by a head byte. */
  private static int tailLengthByHeadValue(int headvalue) {
    if (headvalue >= 0xFC) {
      return 5;
    }
    if (headvalue >= 0xF8) {
      return 4;
    }
    if (headvalue >= 0xF0) {
      return 3;
    }
    if (headvalue >= 0xE0) {
      return 2;
    }
    if (headvalue >= 0xC0) {
      return 1;
    }
    return 0;
  }

  /** Returns the mask selecting the payload bits of a head byte. */
  private static int headMask(int headvalue) {
    if (headvalue >= 0xFC) {
      return 0x1;
    }
    if (headvalue >= 0xF8) {
      return 0x3;
    }
    if (headvalue >= 0xF0) {
      return 0x7;
    }
    if (headvalue >= 0xE0) {
      return 0xF;
    }
    if (headvalue >= 0xC0) {
      return 0x1F;
    }
    return 0x7F;
  }

  /**
   * Decodes the single character whose head byte is at {@code bytes[startpos]}.
   *
   * @param bytes       byte values, one per element
   * @param startpos    index of the head byte
   * @param resulteater array receiving the decoded code point
   * @param resultpos   index in {@code resulteater} to write to
   * @return index of the first byte after the decoded sequence
   * @throws ArrayIndexOutOfBoundsException if the sequence announced by the
   *         head byte runs past the end of {@code bytes}
   */
  public static int decode(int[] bytes, int startpos, int[] resulteater,
      int resultpos) {
    int head = bytes[startpos];
    int numTrailers = tailLengthByHeadValue(head);
    int result = 0;
    int displacement = 0;
    // Walk the continuation bytes from last to first: the last one carries
    // the least significant 6 bits.
    for (int i = numTrailers; i > 0; i--) {
      result += (bytes[startpos + i] & 0x3F) << displacement;
      displacement += 6;
    }
    result += (head & headMask(head)) << displacement;
    resulteater[resultpos] = result;
    return startpos + numTrailers + 1;
  }

  /**
   * Decodes all characters starting in {@code bytes[startpos..stoppos)}.
   *
   * @return index one past the last code point written to {@code result}
   */
  public static int decodeAll(int[] bytes, int startpos, int stoppos,
      int[] result, int resultpos) {
    while (startpos < stoppos) {
      startpos = decode(bytes, startpos, result, resultpos++);
    }
    return resultpos;
  }

  /**
   * Decodes the whole of {@code bytes} into {@code result}, starting at
   * index 0.
   *
   * @return the number of code points written
   */
  public static int decodeAll(int[] bytes, int[] result) {
    return decodeAll(bytes, 0, bytes.length, result, 0);
  }

  /** Returns the number of continuation bytes needed to encode a value. */
  private static int tailLengthByCodePoint(int data) {
    if (data < 0x80) {
      return 0;
    }
    if (data < 0x800) {
      return 1;
    }
    if (data < 0x10000) {
      return 2;
    }
    if (data < 0x200000) {
      return 3;
    }
    if (data < 0x4000000) {
      return 4;
    }
    return 5;
  }

  /** Returns the length-marker bits of the head byte for a value. */
  private static int headValueByCodePoint(int data) {
    if (data < 0x80) {
      return 0;
    }
    if (data < 0x800) {
      return 0xC0;
    }
    if (data < 0x10000) {
      return 0xE0;
    }
    if (data < 0x200000) {
      return 0xF0;
    }
    if (data < 0x4000000) {
      return 0xF8;
    }
    return 0xFC;
  }

  /**
   * Encodes one value as UTF-8, one byte per element of {@code result}.
   *
   * @param data      the non-negative value to encode
   * @param result    array receiving the bytes
   * @param resultpos index of the first byte to write
   * @return index one past the last byte written
   * @throws IllegalArgumentException if {@code data} is negative
   */
  public static int encode(int data, int[] result, int resultpos) {
    if (data < 0) {
      // A negative value would otherwise be written out unchanged as a
      // bogus single "byte".
      throw new IllegalArgumentException("Cannot encode negative value: " + data);
    }
    int length = tailLengthByCodePoint(data);
    int temp = data;
    // Fill the continuation bytes back to front, 6 payload bits each.
    for (int i = length; i > 0; i--) {
      result[resultpos + i] = (temp & 0x3F) | 0x80;
      temp >>>= 6;
    }
    result[resultpos] = temp | headValueByCodePoint(data);
    return resultpos + length + 1;
  }

  /**
   * Encodes {@code data[startpos..stoppos)} into {@code result}.
   *
   * @return index one past the last byte written
   */
  public static int encode_many(int[] data, int startpos, int stoppos,
      int[] result, int resultpos) {
    for (int i = startpos; i < stoppos; i++) {
      resultpos = encode(data[i], result, resultpos);
    }
    return resultpos;
  }

  /**
   * Encodes the whole of {@code data} into {@code result}.
   *
   * @return index one past the last byte written
   */
  public static int encode_all(int[] data, int[] result, int resultpos) {
    return encode_many(data, 0, data.length, result, resultpos);
  }

  /**
   * Prints the first {@code no} values as comma-separated decimal numbers,
   * followed in parentheses by the same values rendered as characters.
   */
  public static void dump(int[] data, int no) {
    StringBuilder numbers = new StringBuilder();
    StringBuilder text = new StringBuilder();
    for (int i = 0; i < no; i++) {
      if (i > 0) {
        numbers.append(", ");
      }
      numbers.append(data[i]);
      if (Character.isValidCodePoint(data[i])) {
        // Handles code points above U+FFFF, which a plain char cast truncates.
        text.appendCodePoint(data[i]);
      } else {
        text.append((char) data[i]);
      }
    }
    System.out.println(numbers + " (" + text + ")");
  }

  /**
   * Runs the demonstration: encode, mis-read as Latin-1 and encode again,
   * then undo the damage by decoding twice.
   */
  public static void main(String[] args) {
    // The letters ae-ligature, o-slash, a-ring in lower and upper case,
    // written as escapes so the source file's own encoding cannot damage them.
    int[] original = new int[] {
        '\u00e6', '\u00f8', '\u00e5', '\u00c6', '\u00d8', '\u00c5'};

    System.out.println(
        "\nThis string (Unicode code points in paranthesis):");
    dump(original, original.length);

    int[] singlycoded = new int[1000];
    int sth = encode_all(original, singlycoded, 0);
    System.out.println("\nis encoded into these UTF-8 bytes. If that is "
        + "then improperly interpreted as Latin-1, it will be interpreted as the "
        + "characters in the parenthesis");
    dump(singlycoded, sth);

    int[] doublycoded = new int[1000];
    sth = encode_many(singlycoded, 0, sth, doublycoded, 0);
    System.out.println("\nThe bad interpretation encoded as Unicode. In "
        + "the parenthesis are the results of repeating the above error of "
        + "interpreting UTF-8 as Latin-1");
    dump(doublycoded, sth);

    int[] decoded = new int[1000];
    sth = decodeAll(doublycoded, 0, sth, decoded, 0);
    System.out.println("\nDecoding the above yields:");
    dump(decoded, sth);

    int[] decoded_again = new int[1000];
    sth = decodeAll(decoded, 0, sth, decoded_again, 0);
    System.out.println("\nDecoding again gives us:");
    dump(decoded_again, sth);
  }
}

Current Thread