| Subject: RE: [xsl] output encoding problem (can't get UTF-8) From: "Gabriel K" <gabriel.klappenbach@xxxxxxxxxxxx> Date: Sun, 7 Aug 2005 10:56:16 +0200 | 
Yeah, I have kind of been forced to learn more about this messed up encoding stuff, so I know the stuff you said below. :P However, others and myself can transform my XML correctly when done not via the ASP page - please see my previous mail. Thanks for your mail, it was interesting reading. :) And sorry about the "SV:", I'm using outlook and it seems you can't change that. /Gabriel > -----Ursprungligt meddelande----- > Från: Soren Kuula [mailto:dongfang@xxxxxxxxxxx] > Skickat: den 7 augusti 2005 00:19 > Till: xsl-list@xxxxxxxxxxxxxxxxxxxxxx > Ämne: Re: SV: [xsl] output encoding problem (can't get UTF-8) > > Gabriel K wrote: > > >That's what I'm thinking too, but the output has screwed up Å Ä Ö in all > >places, not just links. Very weird. I can't think of why this would > happen. > >:/ > > > > > Hi, > > Many people are not really aware what this encoding thing is about... > are you sure that the encoding you have DECLARED is the same as the one > you have APPLIED throughout? The encoding you declare in xml is that in > <?xml encoding="foobar"?>. The encoding you actually use for the > document is the one that your editor, or the tool that generated it, > saved it in. They must be the same....... > > With utf-8, a character expanding into several is caused by: > > - Something writes the document, in utf-8. > - Something reads it, thinking/guessing/defaulting to/being fooled into > thinking that it's Latin-1 or some similar a-character-is-a-byte encoding. > - The above is repeated zero or more times. > > (the opposite phenomenon, characters disappearing, is when programs think > they're reading utf-8 but the data is not utf-8...) > > I cooked up some Java once to demonstrate it: Utf-8 is digested as if it > were one byte a char (as Latin-1). 
This is repeated, a big mess is > made, and the opposite error is then applied twice to clean it up :) > > > If you know emacs, you can inspect an xml document's encoding: > > C-x <return> C <now enter the name of the encoding that the xml header > claims is used> <return> <C-x><C-f><enter filename>. That is: force > emacs to use a specific encoding for the following command, which is then > 'read the file'. If now æ ø å ä ö are bad, then the encoding in the xml > header is lying..... > > One more thing: Can you use a mail client that writes Re: in the header, > not SV: ? We get confused... (and mail ends up in the wrong folder) > > Soren > > /** > * @author dongfang > * See: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8, first table on > * the page... > */ > public class Encoder { > private static int taillength_by_headvalue(int headvalue) { > if (headvalue >= 0xFC) > return 5; > if (headvalue >= 0xF8) > return 4; > if (headvalue >= 0xF0) > return 3; > if (headvalue >= 0xE0) > return 2; > if (headvalue >= 0xC0) > return 1; > return 0; > } > > private static int headmask(int headvalue) { > if (headvalue >= 0xFC) > return 0x1; > if (headvalue >= 0xF8) > return 0x3; > if (headvalue >= 0xF0) > return 0x7; > if (headvalue >= 0xE0) > return 0xF; > if (headvalue >= 0xC0) > return 0x1F; > return 0x7F; > } > > public static int decode(int[] bytes, int startpos, int[] resulteater, > int resultpos) { > int numTrailers = taillength_by_headvalue(bytes[startpos]); > int result = 0; > int displacement = 0; > for (int i=numTrailers; i>0; i--) { > result+=(bytes[startpos + i] & 0x3F) << displacement; > displacement += 6; > } > result += (bytes[startpos] & headmask(bytes[startpos])) << > displacement; > resulteater[resultpos] = result; > return startpos + numTrailers + 1; > } > > public static int decodeAll(int[] bytes, int startpos, int stoppos, > int[] result, int resultpos) { > while(startpos < stoppos) { > startpos=decode(bytes, startpos, result, resultpos++); > } > return 
resultpos; > } > > public static int decodeAll(int[] bytes, int[] result) { > return decodeAll(bytes, 0, bytes.length, result, 0); > } > > private static int taillength_by_codepoint(int data) { > if (data < 0x80) > return 0; > if (data < 0x800) > return 1; > if (data < 0x10000) > return 2; > if (data < 0x200000) > return 3; > if (data < 0x4000000) > return 4; > return 5; > } > > private static int headvalue_by_codepoint(int data) { > if (data < 0x80) > return 0; > if (data < 0x800) > return 0xC0; > if (data < 0x10000) > return 0xE0; > if (data < 0x200000) > return 0xF0; > if (data < 0x4000000) > return 0xF8; > return 0xFC; > } > > public static int encode(int data, int[] result, int resultpos) { > int length = taillength_by_codepoint(data); > int temp = data; > for (int i=length; i>0; i--) { > result[resultpos + i] = (temp & 0x3F) | 0x80; > temp >>>=6; > } > result[resultpos] = temp | headvalue_by_codepoint(data); > return resultpos + length+1; > } > > public static int encode_many(int[] data, int startpos, int stoppos, > int[] result, int resultpos) { > for (int i=startpos; i<stoppos; i++) { > resultpos=encode(data[i], result, resultpos); > } > return resultpos; > } > > public static int encode_all(int[] data, int[] result, int resultpos) { > return encode_many(data, 0, data.length, result, resultpos); > } > > public static void dump(int[] data, int no) { > for (int i=0; i<no; i++) { > System.out.print(data[i]); // + "(" + (char)result[i] + ")"); > if (i<no-1) > System.out.print(", "); > } > System.out.print(" ("); > for (int i=0; i<no; i++) { > System.out.print((char)data[i]); > } > System.out.println(")"); > } > > public static void main (String[] args) { > int[] singlycoded = new int[1000]; > //int no = decodeAll(new int[]{0xC3, 0x83, 0xE8, 0x80, 0xA6}, result); > int[] original = new int[]{'f','x','e','F','X','E'}; > > System.out.println("\nThis string (Unicode code points in > paranthesis):"); > dump(original, original.length); > > int sth = 
encode_all(original, singlycoded, 0); > System.out.println("\nis encoded into these UTF-8 bytes. If that is > then improperly interpreted as Latin-1, it will be interpreted as the > characters in the parenthesis"); > dump(singlycoded, sth); > > int[] doublycoded = new int[1000]; > sth = encode_many(singlycoded, 0, sth, doublycoded, 0); > System.out.println("\nThe bad interpretation encoded as Unicode. In > the parenthesis are the results of repeating the above error of > interpreting UTF-8 as Latin-1"); > dump(doublycoded, sth); > > int[] decoded = new int[1000]; > sth = decodeAll(doublycoded, 0, sth, decoded, 0); > System.out.println("\nDecoding the above yields:"); > dump(decoded, sth); > > int[] decoded_again = new int[1000]; > sth = decodeAll(decoded, 0, sth, decoded_again, 0); > System.out.println("\nDecoding again gives us:"); > dump(decoded_again, sth); > } > }
| Current Thread | 
|---|
| 
 
 | 
| <- Previous | Index | Next -> | 
|---|---|---|
| Re: SV: [xsl] output encoding probl, Soren Kuula | Thread | [xsl] XSL/XHTML IDE, Oleg Konovalov | 
| Re: [xsl] output encoding problem (, Joe Fawcett | Date | RE: [xsl] output encoding problem (, Gabriel K | 
| Month |