[xsl] Sorting substitution instructions by max. length of matches

Subject: [xsl] Sorting substitution instructions by max. length of matches
From: "Yves Forkl (SRZ)" <Y.Forkl@xxxxxx>
Date: Fri, 05 Oct 2007 16:27:15 +0200
(Though not absolutely trivial, the problem described below seems to be common enough that its XSLT 2 solution should have been discussed somewhere already, but I didn't find anything like this anywhere. Do you know of such a resource?)

Given a sequence of substitution instructions in $substitution_instructions, each represented as a node

<substitution>
 <old>regex</old>
 <new>replacement</new>
</substitution>

specifying the regex pattern to search for and the match's replacement, the task is to sort the substitution instructions by the length of their longest match in the input text held in $input.

The aim, of course, is to be able to perform multiple substitutions in the same input while replacing longer matches first, in order to avoid the classical matching prefix problem.

(The actual replacing procedure would be a topic of its own; just assume that the replacement inserted into the input text will be safe from being changed itself again. So even if the input text cannot be considered constant across the substitutions, their proper order of application should be independent of the replacements made, as far as I can see, because if the maximum match length for some regex might decrease, it will never increase when some of its matches disappear through the replacing of overlapping matches. Correct me if I'm being naive here.)

I have come up with a solution based on these steps:

1) Construct a new sequence of substitution instructions that include the maximum match length for each of them, removing all substitution instructions with no match.

2) Sort the substitution instructions first by maximum match length, then alphabetically (i.e. as strings according to the implementation-defined collation) by their regexes. (This second sort key component just serves to produce predictable behavior in case of equal lengths.)

An implementation of this approach is demonstrated by the XSLT 2 stylesheet below (to be passed to Saxon with "-it main"). My question is if you have any suggestions for making the code simpler or more elegant, or if you would recommend taking another way.

Yves


<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"; xmlns:xs="http://www.w3.org/2001/XMLSchema"; xmlns:my="http://xmlns.srz.de/yforkl/xslt/functions"; exclude-result-prefixes="my xs">

<!-- using dashes in function names, underscores in variable names -->

  <xsl:function name="my:annex-max-match-length-to-instruction"
    as="node()*">
    <xsl:param name="inputtext" as="xs:string"/>
    <xsl:param name="substitution_instruction" as="node()"/>

    <xsl:variable name="regex"
      select="string($substitution_instruction/old)"/>
    <xsl:variable name="max_match_length"
      select="max(my:lengths-of-remaining-matches($inputtext, $regex))"/>

    <!-- Add max. match length to subst. instruction, drop those without
         matches -->
    <xsl:if test="$max_match_length ne 0">
      <xsl:element name="substitution">
        <xsl:element name="max_match_length">
          <xsl:sequence select="$max_match_length"/>
        </xsl:element>
        <xsl:copy-of select="$substitution_instruction/old"/>
        <xsl:copy-of select="$substitution_instruction/new"/>
      </xsl:element>
    </xsl:if>
  </xsl:function>

  <xsl:function name="my:lengths-of-remaining-matches" as="xs:integer*">
    <xsl:param name="inputtext" as="xs:string*"/>
    <xsl:param name="regex" as="xs:string"/>

    <xsl:variable name="length_of_next_match"
      select="if (not (matches($inputtext, $regex)))
                 then 0
                 else
                   string-length(replace($inputtext,
                                 concat('^.*?(', $regex, ').*'),
                                 '$1'))"/>

    <!-- Return 0 if no match (zero-length regexes can't occur) -->
    <xsl:sequence select="$length_of_next_match"/>

    <!-- Examine further matches by recursive call -->
    <xsl:if test="$length_of_next_match ne 0">
      <xsl:sequence
        select="my:lengths-of-remaining-matches(
          replace($inputtext, concat('^.*?', $regex), ''),
          $regex)"/>
    </xsl:if>
  </xsl:function>

<xsl:template name="main">

    <!-- sample data -->
    <xsl:variable name="input"
      select="'abcddddxxxxxxxyyyabcxxxxxabcdxabc'"/>

    <!-- sample data -->
    <xsl:variable name="substitution_instructions">
      <xsl:element name="substitution">
        <xsl:element name="old">[a-e]+</xsl:element>
        <xsl:element name="new">***</xsl:element>
      </xsl:element>
      <xsl:element name="substitution">
        <xsl:element name="old">d</xsl:element>
        <xsl:element name="new">#</xsl:element>
      </xsl:element>
      <xsl:element name="substitution">
        <xsl:element name="old">123</xsl:element>
        <xsl:element name="new">...</xsl:element>
      </xsl:element>
      <xsl:element name="substitution">
        <xsl:element name="old">c+</xsl:element>
        <xsl:element name="new">#</xsl:element>
      </xsl:element>
      <xsl:element name="substitution">
        <xsl:element name="old">x+y*</xsl:element>
        <xsl:element name="new">+++</xsl:element>
      </xsl:element>
    </xsl:variable>

    <xsl:variable
      name="substitution_instructions_sorted">
      <xsl:perform-sort>
        <xsl:sort select="max_match_length"
          data-type="number" order="descending"/>
        <xsl:sort select="old" order="ascending"/>
        <xsl:for-each select="$substitution_instructions/substitution">
          <xsl:sequence
            select="my:annex-max-match-length-to-instruction($input, .)"/>
        </xsl:for-each>
      </xsl:perform-sort>
    </xsl:variable>

<xsl:message>
<xsl:value-of
select="concat('input: ', $input, '&#10;')"/>
<xsl:value-of
select="'========================================&#10;'"/>
<xsl:for-each
select="$substitution_instructions_sorted/substitution">
<xsl:value-of
select="concat('regex: ', old, '&#10;')"/>
<xsl:value-of
select="concat('max. match length: ', max_match_length, '&#10;')"/>
<xsl:value-of
select="'----------------------------------------&#10;'"/>
</xsl:for-each>
</xsl:message>


</xsl:template>

</xsl:stylesheet>

Current Thread