Re: [xsl] decoding percent-escaped octet sequences

Subject: Re: [xsl] decoding percent-escaped octet sequences
From: Julian Reschke <julian.reschke@xxxxxx>
Date: Thu, 26 May 2011 10:56:02 +0200
On 2011-05-20 17:34, Julian Reschke wrote:
Hi,

do XSLT/XPath2 offer an elegant way to convert percent-escaped octet
sequences to strings (both for ISO-8859-1 and UTF-8)?

So far I found codepoints-to-string, but that would mean that I'd still
have to do

1) percent-escaped-string to sequence of octets, and
2) sequence-of UTF-8 octets to sequence of codepoints.

Did I miss something here?

Best regards, Julian
...

Thanks for all the feedback. In the end I went for a pure XSLT2 implementation, supporting ISO-8859-1 and UTF-8. See below.


I'm doing a lot of XSLT 1.0 but not so much XSLT 2.0, so comments on how to make this more elegant are welcome.

XSLT (to be applied to some random XML):

<?xml version="1.0" encoding="ISO-8859-1"?>
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
               version="2.0"
               xmlns:xs="http://www.w3.org/2001/XMLSchema"
               xmlns:myns="mailto:julian.reschke@xxxxxxxxxxxxx?subject=pctdecode"
               exclude-result-prefixes="myns"
>

<xsl:output method="xml" indent="yes"/>

<!--
  Test driver. The input document is only used to trigger this template;
  the output is a <results> element holding one <result> per sample
  (encoding name, percent-escaped value) pair.
-->
<xsl:template match="/">
  <results>
    <xsl:sequence select="
        myns:test('utf-8',      'A%20C'),
        myns:test('iso-8859-1', 'A%20C'),
        myns:test('utf-8',      'A%C3%A4'),
        myns:test('iso-8859-1', 'A%E4'),
        myns:test('utf-8',      'A%E4')"/>
  </results>
</xsl:template>

<!--
  myns:test($enc, $value)
  Builds one <result> element for a sample: <input> echoes the encoding
  name and the percent-escaped value, <parsed> holds whatever the matching
  decoder returns. The encoding name is compared case-insensitively; for any
  name other than 'utf-8' or 'iso-8859-1' the <parsed> element stays empty.
-->
<xsl:function name="myns:test">
  <xsl:param name="enc"/>
  <xsl:param name="value"/>

  <!-- normalise once instead of in every branch -->
  <xsl:variable name="charset" select="lower-case($enc)"/>

  <result>
    <input>
      <enc><xsl:value-of select="$enc"/></enc>
      <value><xsl:value-of select="$value"/></value>
    </input>
    <parsed>
      <!-- sequence of <c> / <octet> elements produced by the percent-decoder -->
      <xsl:variable name="items" select="myns:pct-decode($value)"/>

      <xsl:choose>
        <xsl:when test="$charset = 'utf-8'">
          <xsl:sequence select="myns:decode-utf-8($items)"/>
        </xsl:when>
        <xsl:when test="$charset = 'iso-8859-1'">
          <xsl:sequence select="myns:decode-iso-8859-1($items)"/>
        </xsl:when>
        <!-- unsupported encoding: nothing is emitted -->
      </xsl:choose>
    </parsed>
  </result>
</xsl:function>

<!--
  Regular-expression fragments used by myns:pct-decode.
  DIGIT, ALPHA, HEXDIG and attr-char are character-class *contents* (they are
  meant to be placed between '[' and ']'); pct-encoded is a complete
  sub-expression matching '%' followed by two hex digits.
  NOTE(review): the names look like the ABNF rules of RFC 5987 - confirm.
-->
<xsl:variable name="DIGIT"><xsl:text>0-9</xsl:text></xsl:variable>
<xsl:variable name="ALPHA"><xsl:text>a-zA-Z</xsl:text></xsl:variable>
<xsl:variable name="HEXDIG"><xsl:value-of select="concat('a-fA-F', $DIGIT)"/></xsl:variable>
<xsl:variable name="attr-char"><xsl:value-of select="concat('!#\$&amp;\+\-\.\^_`\|~', $DIGIT, $ALPHA)"/></xsl:variable>
<xsl:variable name="pct-encoded"><xsl:value-of select="concat('%[', $HEXDIG, '][', $HEXDIG, ']')"/></xsl:variable>


<!--
  myns:pct-decode($s)
  Tokenises $s into a sequence of elements:
    <c>X</c>        for a literal attr-char, or for a %HH escape whose value
                    is below 128 (X is the corresponding character);
    <octet>N</octet> for a %HH escape whose value N (decimal) is 128 or above.
  Parts of $s that match neither alternative produce no output, because there
  is no xsl:non-matching-substring branch.
-->
<xsl:function name="myns:pct-decode">
  <xsl:param name="s"/>

  <!-- group 1 = percent escape, otherwise a single permitted literal character -->
  <xsl:variable name="token-regex"
                select="concat('(', $pct-encoded, ')|[', $attr-char, ']')"/>

  <!-- position in this list minus one is the value of a hex digit -->
  <xsl:variable name="hex-digits"
                select="('0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F')"/>

  <xsl:analyze-string select="$s" regex="{$token-regex}" flags="mx">
    <xsl:matching-substring>
      <xsl:choose>
        <xsl:when test="not(starts-with(., '%'))">
          <!-- single literal character -->
          <c><xsl:value-of select="."/></c>
        </xsl:when>
        <xsl:otherwise>
          <!-- the two hex digits following '%', upper-cased for the lookup -->
          <xsl:variable name="pair" select="upper-case(substring(., 2, 2))"/>
          <xsl:variable name="high" select="index-of($hex-digits, substring($pair, 1, 1)) - 1"/>
          <xsl:variable name="low" select="index-of($hex-digits, substring($pair, 2, 1)) - 1"/>
          <xsl:variable name="value" select="$high * 16 + $low"/>
          <xsl:choose>
            <xsl:when test="$value &lt; 128">
              <c><xsl:value-of select="codepoints-to-string($value)"/></c>
            </xsl:when>
            <xsl:otherwise>
              <octet><xsl:value-of select="$value"/></octet>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:matching-substring>
  </xsl:analyze-string>
</xsl:function>

<!--
  myns:decode-iso-8859-1($s)
  $s is a sequence of <c> / <octet> elements (see myns:pct-decode).
  Each <octet> is mapped to the character with the same code point, except
  that values 128..159 are reported as illegal; <c> elements contribute their
  text unchanged.
  Returns <string>...</string> on success, or a single <illegal-octet>
  element carrying the offending value(s) if any were found.
-->
<xsl:function name="myns:decode-iso-8859-1">
  <xsl:param name="s"/>

  <!-- temporary tree: decoded text interleaved with <illegal-octet> markers -->
  <xsl:variable name="decoded">
    <xsl:for-each select="$s">
      <xsl:choose>
        <!-- predicates keep the numeric comparison restricted to <octet> items -->
        <xsl:when test="self::octet[. &gt; 127][. &lt; 160]">
          <illegal-octet><xsl:value-of select="."/></illegal-octet>
        </xsl:when>
        <xsl:when test="self::octet">
          <xsl:value-of select="codepoints-to-string(.)"/>
        </xsl:when>
        <xsl:otherwise>
          <xsl:value-of select="."/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:for-each>
  </xsl:variable>

  <xsl:variable name="bad" select="$decoded/illegal-octet"/>

  <xsl:choose>
    <xsl:when test="$bad">
      <illegal-octet><xsl:value-of select="$bad"/></illegal-octet>
    </xsl:when>
    <xsl:otherwise>
      <string><xsl:value-of select="$decoded"/></string>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>


<!--
  myns:decode-utf-8($s)
  $s is a sequence of <c> / <octet> elements (see myns:pct-decode).
  Every item is first turned into an <o> element holding a decimal octet
  value (<octet> as is, <c> via its code point); the resulting list is then
  handed to the recursive "internal-utf8" template.
  Returns <string>...</string> on success, or <illegal-octet> with the
  value reported by the decoder.
-->
<xsl:function name="myns:decode-utf-8">
  <xsl:param name="s"/>

  <!-- uniform octet list: one <o> per input item -->
  <xsl:variable name="bytes">
    <xsl:for-each select="$s">
      <o><xsl:value-of select="if (self::octet) then . else string-to-codepoints(.)"/></o>
    </xsl:for-each>
  </xsl:variable>

  <!-- temporary tree: decoded text, possibly ending in an <illegal-octet> marker -->
  <xsl:variable name="decoded">
    <xsl:call-template name="internal-utf8">
      <xsl:with-param name="octets" select="$bytes/o"/>
    </xsl:call-template>
  </xsl:variable>

  <xsl:variable name="bad" select="$decoded/illegal-octet"/>

  <xsl:choose>
    <xsl:when test="$bad">
      <illegal-octet><xsl:value-of select="$bad"/></illegal-octet>
    </xsl:when>
    <xsl:otherwise>
      <string><xsl:value-of select="$decoded"/></string>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>


<!--
  internal-utf8
  Recursively decodes a list of UTF-8 octets into text.

  Parameter:
    octets - sequence of items whose value is a decimal octet (0..255),
             e.g. the <o> elements built by myns:decode-utf-8.

  Output:
    The decoded characters as text. Decoding stops at the first octet that
    does not start a valid sequence; in that case an
    <illegal-octet>N</illegal-octet> element (N = that octet) is emitted and
    the remaining input is not processed.

  A sequence is rejected when
    - the lead octet is not 00..7F, C2..DF, E0..EF or F0..F4,
    - a continuation octet is missing or outside 80..BF,
    - the encoding is overlong (the value fits into a shorter form),
    - the code point is not a legal XML 1.0 character (this covers
      surrogates, U+FFFE/U+FFFF and most C0 controls), since
      codepoints-to-string() would raise a dynamic error for those.
-->
<xsl:template name="internal-utf8">
  <xsl:param name="octets"/>

  <!-- value of the lead octet; empty when the input is exhausted -->
  <xsl:variable name="lead" as="xs:integer?" select="xs:integer($octets[1])"/>

  <!-- total sequence length announced by the lead octet; 0 = not a valid lead -->
  <xsl:variable name="len" as="xs:integer" select="
      if ($lead ge 0 and $lead le 127) then 1
      else if ($lead ge 194 and $lead le 223) then 2
      else if ($lead ge 224 and $lead le 239) then 3
      else if ($lead ge 240 and $lead le 244) then 4
      else 0"/>

  <!-- the continuation octets that belong to this sequence (may be too few) -->
  <xsl:variable name="tail" as="xs:integer*" select="
      for $o in subsequence($octets, 2, $len - 1) return xs:integer($o)"/>

  <!-- all announced continuation octets are present and of the form 10xxxxxx -->
  <xsl:variable name="well-formed" as="xs:boolean" select="
      $len gt 0
      and count($tail) eq $len - 1
      and (every $t in $tail satisfies ($t ge 128 and $t le 191))"/>

  <!-- payload bits: lead contributes 7/5/4/3 bits, each continuation 6 bits -->
  <xsl:variable name="cp" as="xs:integer?" select="
      if (not($well-formed)) then ()
      else if ($len eq 1) then $lead
      else if ($len eq 2) then ($lead mod 32) * 64 + ($tail[1] mod 64)
      else if ($len eq 3) then
        (($lead mod 16) * 64 + ($tail[1] mod 64)) * 64 + ($tail[2] mod 64)
      else
        ((($lead mod 8) * 64 + ($tail[1] mod 64)) * 64 + ($tail[2] mod 64)) * 64
          + ($tail[3] mod 64)"/>

  <!-- smallest code point that legitimately needs $len octets -->
  <xsl:variable name="min" as="xs:integer?" select="(0, 128, 2048, 65536)[$len]"/>

  <!-- not overlong, and a character XML 1.0 allows -->
  <xsl:variable name="valid" as="xs:boolean" select="
      exists($cp)
      and $cp ge $min
      and ($cp = (9, 10, 13)
           or ($cp ge 32 and $cp le 55295)
           or ($cp ge 57344 and $cp le 65533)
           or ($cp ge 65536 and $cp le 1114111))"/>

  <xsl:choose>
    <xsl:when test="empty($octets)"><!-- done --></xsl:when>
    <xsl:when test="$valid">
      <xsl:value-of select="codepoints-to-string($cp)"/>
      <xsl:call-template name="internal-utf8">
        <xsl:with-param name="octets" select="$octets[position() gt $len]"/>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <illegal-octet><xsl:value-of select="$octets[1]"/></illegal-octet>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>

</xsl:transform>

Output:

<?xml version="1.0" encoding="UTF-8"?>
<results xmlns:xs="http://www.w3.org/2001/XMLSchema">
   <result>
      <input>
         <enc>utf-8</enc>
         <value>A%20C</value>
      </input>
      <parsed>
         <string>A C</string>
      </parsed>
   </result>
   <result>
      <input>
         <enc>iso-8859-1</enc>
         <value>A%20C</value>
      </input>
      <parsed>
         <string>A C</string>
      </parsed>
   </result>
   <result>
      <input>
         <enc>utf-8</enc>
         <value>A%C3%A4</value>
      </input>
      <parsed>
         <string>Aä</string>
      </parsed>
   </result>
   <result>
      <input>
         <enc>iso-8859-1</enc>
         <value>A%E4</value>
      </input>
      <parsed>
         <string>Aä</string>
      </parsed>
   </result>
   <result>
      <input>
         <enc>utf-8</enc>
         <value>A%E4</value>
      </input>
      <parsed>
         <illegal-octet>228</illegal-octet>
      </parsed>
   </result>
</results>

Best regards, Julian

Current Thread