[xsl] URI escaping

Subject: [xsl] URI escaping
From: "Wesley W. Terpstra" <wesley@xxxxxxxxxxx>
Date: Sun, 18 Aug 2002 20:59:37 +0200
Ok, so I now have something that works well for escaping strings in xsl.

The attached xsl code should work with utf-8 for URIs since the RFC
guarantees for non-ascii characters both the xslt engine AND the browser
will utf-8 encode then hexify when seeing non-ascii in a uri.

However, this means that this only works for URIs; one can't use
translate(..., '%', '=') on the output and expect it to work.
(which is what I was wanting since I want to output email headers with
utf-8 encoding)

For this case, I automatically detect if a correct escape-uri(...) function
is present in the xslt engine. If so, we use that and therefore the
translate trick works since escape-uri will utf-8 and hexify the passed
string.

If there is no escape-uri to deal with utf-8, it will default to outputting
?s instead of the real characters. This can be overridden to use the uri
hack outlined above.

This code obeys the RFC to the letter. (I even deal with the nasty % case)

The entry point to the xsl code is:

<xsl:call-template name="my-escape-uri">
 <xsl:with-param name="str" select="'the url here'"/>
 <xsl:with-param name="allow-utf8" select="true()"/>
</xsl:call-template>

Also attached is a patch which adds escape-uri xpath code to libxml2.

I imagine similar code could be derived from this for other xslt engines.
I am mainly attaching the C since the if (...) is tested and works and 
could be re-used.

These two functions behave in exactly the same way --- based off the RFC.

Comments?

-- 
Wesley W. Terpstra <wesley@xxxxxxxxxxx>

--------------
escape-uri.xsl
--------------

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<xsl:stylesheet 
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns="http://www.w3.org/1999/xhtml"
    version="1.0">

<xsl:output method="html" indent="no" encoding="UTF-8"
    doctype-system="http://www.w3.org/TR/html4/strict.dtd"
    doctype-public="-//W3C//DTD HTML 4.0 Transitional//EN"/>

<!-- Escape URIs -->

<!-- Self-test data for detecting a usable native escape-uri().
     $uri-input contains a reserved character (@), a space, a slash, DEL,
     one valid %xx escape (%ab) and two invalid ones (%zu and a trailing %c).
     $uri-output is the result expected from escape-uri($uri-input, true()). -->
<xsl:variable name="uri-input">The@dog-z went/_&#127;_%ab%zu%c</xsl:variable>
<xsl:variable name="uri-output">The%40dog-z%20went%2F_%7F_%ab%25zu%25c</xsl:variable>
<!-- True only when escape-uri() exists AND produces the expected result.
     function-available() is tested first: XPath 1.0 does not evaluate the
     right operand of "and" when the left operand is false, so an engine
     that lacks escape-uri() never reaches the call, which would otherwise
     be reported as an error instead of simply yielding false. -->
<xsl:variable name="have-escape-uri" select="function-available('escape-uri') and escape-uri($uri-input, true()) = $uri-output"/>

<!-- According to the RFC, non-ascii chars will be utf-8 encoded and escaped
     with %s by the xslt-engine when in a 'uri' attribute, or by the browser
     if the xslt-engine doesn't. This is ok, but not enough, since we still
     won't have working RFC822 (email) Froms: they need =s instead of %s.
     However, as there is nothing I can do about this, I will just hope for
     the best if an xslt engine doesn't have escape-uri.

     Lookup tables used by the do-escape-uri template:
       ascii-charset  the characters U+0020 (space) through U+007F (DEL) in
                      ascending code order; do-escape-uri derives a
                      character's code from its position in this string
       uri-ok         characters copied through unescaped: the "marks"
                      plus digits and ASCII letters
       hex            the first 16 characters are indexed to produce
                      upper-case hex digits; the whole string, including
                      a to f, is used to recognise an existing %xx escape -->
<xsl:variable name="ascii-charset"> !&quot;#$%&amp;&apos;()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~&#127;</xsl:variable>
<xsl:variable name="uri-ok">-_.!~*&apos;()0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz</xsl:variable>
<xsl:variable name="hex">0123456789ABCDEFabcdef</xsl:variable>

<!-- do-escape-uri: pure-XSLT fallback that percent-escapes $str, handling
     one character per recursive call.
       str         the string to escape
       allow-utf8  when true, non-ASCII characters are copied through
                   unchanged; otherwise each one is replaced by %3F ("?")
     Rules, tried in order on the first character of $str:
       1. "%" followed by two hex digits is kept as-is
       2. characters listed in $uri-ok are kept as-is
       3. TAB, LF and CR become %09, %0A and %0D
       4. characters not in $ascii-charset follow allow-utf8
       5. every other character becomes %XX with upper-case hex digits -->
<xsl:template name="do-escape-uri">
 <xsl:param name="str"/>
 <xsl:param name="allow-utf8"/>

 <xsl:if test="$str">
  <xsl:variable name="first-char" select="substring($str,1,1)"/>
  <xsl:choose>
   <!-- The length test is required: contains($hex, '') is true in XPath,
        so without it a trailing "%" or "%a" would pass as a valid escape. -->
   <xsl:when test="$first-char = '%' and string-length($str) &gt;= 3 and contains($hex, substring($str,2,1)) and contains($hex, substring($str,3,1))">
    <!-- The percent char is ok IF it is followed by a valid hex pair -->
    <xsl:value-of select="$first-char"/>
   </xsl:when>
   <xsl:when test="contains($uri-ok, $first-char)">
    <!-- This char is ok inside urls -->
    <xsl:value-of select="$first-char"/>
   </xsl:when>
   <!-- TAB, LF and CR are the only characters below U+0020 that XML 1.0
        allows. They are not listed in $ascii-charset, so without these
        branches they would be mistaken for non-ASCII characters and come
        out raw (or as %3F) instead of being percent-escaped. -->
   <xsl:when test="$first-char = '&#9;'">
    <xsl:text>%09</xsl:text>
   </xsl:when>
   <xsl:when test="$first-char = '&#10;'">
    <xsl:text>%0A</xsl:text>
   </xsl:when>
   <xsl:when test="$first-char = '&#13;'">
    <xsl:text>%0D</xsl:text>
   </xsl:when>
   <xsl:when test="not(contains($ascii-charset, $first-char))">
    <!-- Non-ascii: output raw or as an escaped "?" depending on allow-utf8 -->
    <xsl:choose>
     <xsl:when test="$allow-utf8">
      <xsl:value-of select="$first-char"/>
     </xsl:when>
     <xsl:otherwise>
      <xsl:text>%3F</xsl:text>
     </xsl:otherwise>
    </xsl:choose>
   </xsl:when>
   <xsl:otherwise>
    <!-- URL escape this char. $ascii-charset starts at U+0020, so the
         number of characters before the match plus 32 is the char code. -->
    <xsl:variable name="ascii-value" select="string-length(substring-before($ascii-charset,$first-char)) + 32"/>
    <xsl:text>%</xsl:text>
    <!-- Indexes 1 to 16 of $hex are the upper-case digits 0 to F -->
    <xsl:value-of select="substring($hex,floor($ascii-value div 16) + 1,1)"/>
    <xsl:value-of select="substring($hex,$ascii-value mod 16 + 1,1)"/>
   </xsl:otherwise>
  </xsl:choose>
  
  <!-- Continue with the remainder of the string -->
  <xsl:call-template name="do-escape-uri">
   <xsl:with-param name="str" select="substring($str,2)"/>
   <xsl:with-param name="allow-utf8" select="$allow-utf8"/>
  </xsl:call-template>
 </xsl:if>
</xsl:template>

<!-- my-escape-uri: public entry point for URI escaping.
       str         the string to escape
       allow-utf8  forwarded to do-escape-uri; not used when the native
                   escape-uri() function is selected
     Uses the engine's own escape-uri() when the $have-escape-uri self-test
     succeeded, and the do-escape-uri template otherwise. -->
<xsl:template name="my-escape-uri">
 <xsl:param name="str"/>
 <xsl:param name="allow-utf8"/>

 <xsl:choose>
  <xsl:when test="not($have-escape-uri)">
   <!-- No working native implementation: escape in pure XSLT -->
   <xsl:call-template name="do-escape-uri">
    <xsl:with-param name="allow-utf8" select="$allow-utf8"/>
    <xsl:with-param name="str" select="$str"/>
   </xsl:call-template>
  </xsl:when>
  <xsl:otherwise>
   <!-- Native implementation passed the self-test -->
   <xsl:value-of select="escape-uri($str, true())"/>
  </xsl:otherwise>
 </xsl:choose>
</xsl:template>

<!-- Demo driver: escapes a fixed sample string and emits the result as the
     content of an html element. allow-utf8 is not passed, so the parameter
     keeps its empty default in my-escape-uri. -->
<xsl:template match="/">
 <xsl:variable name="sample" select="'The@dog-z went/_&#255;&#127;_%ab%zu%c'"/>
 <html>
  <xsl:call-template name="my-escape-uri">
   <xsl:with-param name="str" select="$sample"/>
  </xsl:call-template>
 </html>
</xsl:template>

</xsl:stylesheet>

----------------
escape-uri.patch
----------------

diff -rc libxml2-2.4.23.orig/xpath.c libxml2-2.4.23/xpath.c
*** libxml2-2.4.23.orig/xpath.c	Tue Jul  2 04:35:15 2002
--- libxml2-2.4.23/xpath.c	Sun Aug 18 03:47:58 2002
***************
*** 6457,6462 ****
--- 6457,6570 ----
  }
  
  /**
+  * xmlXPathEscapeUriFunction:
+  * @ctxt:  the XPath Parser context
+  * @nargs:  the number of arguments
+  *
+  * Implement the escape-uri() XPath function
+  *    string escape-uri(string $str, bool $escape-reserved)
+  *
+  * This function applies the URI escaping rules defined in section 2 of [RFC
+  * 2396] to the string supplied as $str, which typically represents all
+  * or part of a URI. The effect of the function is to replace any special
+  * character in the string by an escape sequence of the form %xx%yy...,
+  * where xxyy... is the hexadecimal representation of the octets used to
+  * represent the character in UTF-8.
+  *
+  * The set of characters that are escaped depends on the setting of the
+  * boolean argument $escape-reserved.
+  *
+  * If $escape-reserved is true, all characters are escaped other than lower
+  * case letters a-z, upper case letters A-Z, digits 0-9, and the characters
+  * referred to in [RFC 2396] as "marks": specifically, "-" | "_" | "." | "!"
+  * | "~" | "*" | "'" | "(" | ")". The "%" character itself is escaped only
+  * if it is not followed by two hexadecimal digits (that is, 0-9, a-f, and
+  * A-F).
+  *
+  * If $escape-reserved is false, the behavior differs in that characters
+  * referred to in [RFC 2396] as reserved characters are not escaped. These
+  * characters are ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ",".
+  * 
+  * [RFC 2396] does not define whether escaped URIs should use lower case or
+  * upper case for hexadecimal digits. To ensure that escaped URIs can be
+  * compared using string comparison functions, this function must always use
+  * the upper-case letters A-F.
+  * 
+  * Generally, $escape-reserved should be set to true when escaping a string
+  * that is to form a single part of a URI, and to false when escaping an
+  * entire URI or URI reference.
+  * 
+  * In the case of non-ascii characters, the string is encoded according to 
+  * utf-8 and then converted according to RFC 2396.
+  *
+  * Examples
+  *  xf:escape-uri ("gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles#ocean", true())
+  *  returns "gopher%3A%2F%2Fspinaltap.micro.umn.edu%2F00%2FWeather%2FCalifornia%2FLos%20Angeles%23ocean"
+  *  xf:escape-uri ("gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles#ocean", false())
+  *  returns "gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles%23ocean"
+  *
+  */
+ void
+ xmlXPathEscapeUriFunction(xmlXPathParserContextPtr ctxt, int nargs) {
+     xmlXPathObjectPtr str;
+     int escape_reserved;
+     xmlBufferPtr target;
+     xmlChar *cptr;
+     xmlChar escape[4];	/* holds "%XX" plus a terminating 0 */
+     
+     CHECK_ARITY(2);
+     
+     escape_reserved = xmlXPathPopBoolean(ctxt);	/* 2nd argument is popped first */
+     
+     CAST_TO_STRING;
+     str = valuePop(ctxt);
+     
+     target = xmlBufferCreate();
+     
+     escape[0] = '%';	/* constant parts of the escape sequence */
+     escape[3] = 0;
+     
+     if (target) {	/* only scan the input when a buffer was obtained */
+ 	for (cptr = str->stringval; *cptr; cptr++) {
+ 	    if ((*cptr >= 'A' && *cptr <= 'Z') ||
+ 		(*cptr >= 'a' && *cptr <= 'z') ||
+ 		(*cptr >= '0' && *cptr <= '9') ||
+ 		*cptr == '-' || *cptr == '_' || *cptr == '.' || 
+ 		*cptr == '!' || *cptr == '~' || *cptr == '*' ||
+ 		*cptr == '\''|| *cptr == '(' || *cptr == ')' ||
+ 		(*cptr == '%' && 
+ 		 ((cptr[1] >= 'A' && cptr[1] <= 'F') ||
+ 		  (cptr[1] >= 'a' && cptr[1] <= 'f') ||
+ 		  (cptr[1] >= '0' && cptr[1] <= '9')) &&
+ 		 ((cptr[2] >= 'A' && cptr[2] <= 'F') ||
+ 		  (cptr[2] >= 'a' && cptr[2] <= 'f') ||
+ 		  (cptr[2] >= '0' && cptr[2] <= '9'))) ||
+ 		(!escape_reserved &&
+ 		 (*cptr == ';' || *cptr == '/' || *cptr == '?' ||
+ 		  *cptr == ':' || *cptr == '@' || *cptr == '&' ||
+ 		  *cptr == '=' || *cptr == '+' || *cptr == '$' ||
+ 		  *cptr == ','))) {
+ 		xmlBufferAdd(target, cptr, 1);	/* allowed: copy the byte unchanged */
+ 	    } else {
+ 		if ((*cptr >> 4) < 10)	/* high nibble -> upper-case hex digit */
+ 		    escape[1] = '0' + (*cptr >> 4);
+ 		else
+ 		    escape[1] = 'A' - 10 + (*cptr >> 4);
+ 		if ((*cptr & 0xF) < 10)	/* low nibble -> upper-case hex digit */
+ 		    escape[2] = '0' + (*cptr & 0xF);
+ 		else
+ 		    escape[2] = 'A' - 10 + (*cptr & 0xF);
+ 		
+ 		xmlBufferAdd(target, &escape[0], 3);	/* append the three bytes "%XX" */
+ 	    }
+ 	}
+     }
+     valuePush(ctxt, xmlXPathNewString(xmlBufferContent(target)));
+     xmlBufferFree(target);
+     xmlXPathFreeObject(str);
+ }
+ 
+ /**
   * xmlXPathBooleanFunction:
   * @ctxt:  the XPath Parser context
   * @nargs:  the number of arguments
***************
*** 10646,10651 ****
--- 10755,10762 ----
                           xmlXPathContainsFunction);
      xmlXPathRegisterFunc(ctxt, (const xmlChar *)"id",
                           xmlXPathIdFunction);
+     xmlXPathRegisterFunc(ctxt, (const xmlChar *)"escape-uri",
+                          xmlXPathEscapeUriFunction);
      xmlXPathRegisterFunc(ctxt, (const xmlChar *)"false",
                           xmlXPathFalseFunction);
      xmlXPathRegisterFunc(ctxt, (const xmlChar *)"floor",

 XSL-List info and archive:  http://www.mulberrytech.com/xsl/xsl-list


Current Thread