[xsl] Re: Anyone implemented a fuzzy matcher in XPath?

Subject: [xsl] Re: Anyone implemented a fuzzy matcher in XPath?
From: "Roger L. Cauvin" <roger@xxxxxxxxxx>
Date: Wed, 30 Jan 2013 05:15:38 -0600
"Costello, Roger L." <costello@xxxxxxxxx> wrote:

Has anyone implemented a fuzzy matcher (approximate string
matcher [1]) in XPath?

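I implemented one, based on pairwise comparison of adjacent letters, as an XSLT 1.0 template. The compare-strings template takes two strings as parameters and returns a score between 0 and 1 representing the closeness of the match (twice the number of letter pairs the strings share, divided by the total number of letter pairs in both strings):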


<?xml version="1.0" ?>

<!--
  Fuzzy (approximate) string matching for XSLT 1.0.
  Entry point: the named template "compare-strings".

  The "str" prefix is bound to the EXSLT strings namespace and registered as
  an extension-element prefix; none of the templates in this stylesheet use it.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                              xmlns:str="http://exslt.org/strings"
                              extension-element-prefixes="str">

  <!--
    Source and target alphabets for translate(), used to fold input to upper
    case before comparison. Only the 26 unaccented Latin letters are listed,
    so any other character passes through translate() unchanged.
  -->
  <xsl:variable name="lower-case-letters">abcdefghijklmnopqrstuvwxyz</xsl:variable>
  <xsl:variable name="upper-case-letters">ABCDEFGHIJKLMNOPQRSTUVWXYZ</xsl:variable>

  <!--
    compare-strings: entry point of the matcher.

    Parameters:
      string1, string2 - the two strings to compare.

    Each string is folded to upper case with translate() and the global
    alphabet variables, then whitespace-normalised. The letter pairs of each
    normalised string are collected by "get-word-letter-pairs" and both pair
    lists are handed to "compare-pairs", whose output is the output of this
    template.
  -->
  <xsl:template name="compare-strings">
    <xsl:param name="string1"/>
    <xsl:param name="string2"/>

    <!-- Upper-cased, whitespace-normalised forms of the two inputs. -->
    <xsl:variable name="normalized1"
                  select="normalize-space(translate($string1, $lower-case-letters, $upper-case-letters))"/>
    <xsl:variable name="normalized2"
                  select="normalize-space(translate($string2, $lower-case-letters, $upper-case-letters))"/>

    <xsl:call-template name="compare-pairs">
      <!-- Each parameter carries the space-terminated pair list of one input. -->
      <xsl:with-param name="pairs1">
        <xsl:call-template name="get-word-letter-pairs">
          <xsl:with-param name="string" select="$normalized1"/>
        </xsl:call-template>
      </xsl:with-param>
      <xsl:with-param name="pairs2">
        <xsl:call-template name="get-word-letter-pairs">
          <xsl:with-param name="string" select="$normalized2"/>
        </xsl:call-template>
      </xsl:with-param>
    </xsl:call-template>

  </xsl:template>

  <!--
    compare-pairs: score two letter-pair lists.

    Parameters:
      pairs1, pairs2 - pair lists in which every pair is two characters
                       followed by one space (three characters per pair),
                       which is why the pair count is string-length div 3.

    Output:
      2 * (number of pairs shared by both lists) div (total number of pairs
      in both lists). When neither list contains a pair the total is zero;
      0 is emitted in that case rather than letting 0 div 0 produce "NaN".
  -->
  <xsl:template name="compare-pairs">
    <xsl:param name="pairs1"/>
    <xsl:param name="pairs2"/>

    <xsl:variable name="num-pairs1" select="string-length($pairs1) div 3"/>
    <xsl:variable name="num-pairs2" select="string-length($pairs2) div 3"/>
    <!-- Sum of both pair counts (the denominator of the score). -->
    <xsl:variable name="union" select="$num-pairs1 + $num-pairs2"/>

    <xsl:choose>
      <!-- No pairs on either side: nothing to compare, and dividing would yield NaN. -->
      <xsl:when test="$union = 0">
        <xsl:value-of select="0"/>
      </xsl:when>
      <xsl:otherwise>
        <!-- Number of pairs that occur in both lists, each occurrence counted once. -->
        <xsl:variable name="intersection">
          <xsl:call-template name="intersect-remaining-pairs">
            <xsl:with-param name="pairs1" select="$pairs1"/>
            <xsl:with-param name="pairs2" select="$pairs2"/>
          </xsl:call-template>
        </xsl:variable>

        <xsl:value-of select="2.0 * $intersection div $union"/>
      </xsl:otherwise>
    </xsl:choose>

  </xsl:template>

  <!--
    intersect-remaining-pairs: count the pairs of pairs1 that also occur in
    pairs2.

    Parameters:
      pairs1       - space-terminated pair list still to be examined.
      pairs2       - space-terminated pair list to search; a matched pair is
                     removed from it before recursing, so each occurrence in
                     pairs2 can be matched only once.
      intersection - running count of matches (defaults to 0).

    Output: the final count, emitted once pairs1 yields no further pair.
  -->
  <xsl:template name="intersect-remaining-pairs">
    <xsl:param name="pairs1"/>
    <xsl:param name="pairs2"/>
    <xsl:param name="intersection">0</xsl:param>

    <!-- First pair of pairs1 (empty when none is left) and the list after it. -->
    <xsl:variable name="head" select="substring-before($pairs1, ' ')"/>
    <xsl:variable name="tail" select="substring-after($pairs1, ' ')"/>

    <xsl:choose>
      <!-- The head pair is present in pairs2: count it and drop its first occurrence. -->
      <xsl:when test="not($head = '') and contains($pairs2, $head)">
        <xsl:call-template name="intersect-remaining-pairs">
          <xsl:with-param name="pairs1" select="$tail"/>
          <xsl:with-param name="pairs2"
                          select="concat(substring-before($pairs2, $head), substring-after($pairs2, concat($head, ' ')))"/>
          <xsl:with-param name="intersection" select="$intersection + 1"/>
        </xsl:call-template>
      </xsl:when>
      <!-- The head pair is absent from pairs2: move on without counting. -->
      <xsl:when test="not($head = '')">
        <xsl:call-template name="intersect-remaining-pairs">
          <xsl:with-param name="pairs1" select="$tail"/>
          <xsl:with-param name="pairs2" select="$pairs2"/>
          <xsl:with-param name="intersection" select="$intersection"/>
        </xsl:call-template>
      </xsl:when>
      <!-- Nothing left to examine: emit the accumulated count. -->
      <xsl:otherwise>
        <xsl:value-of select="$intersection"/>
      </xsl:otherwise>
    </xsl:choose>

  </xsl:template>

  <!--
    get-word-letter-pairs: collect the letter pairs of every word in a string.

    Parameters:
      string - remaining text; words are delimited by a single space.
      pairs  - pair list accumulated so far (defaults to empty).

    Output: the accumulated list once the remaining text is empty. Pairs are
    produced per word by "get-letter-pairs", so no pair spans two words.
  -->
  <xsl:template name="get-word-letter-pairs">
    <xsl:param name="string"/>
    <xsl:param name="pairs"></xsl:param>

    <xsl:choose>
      <xsl:when test="not($string = '')">
        <!-- Appending a space makes substring-before yield the whole string when it holds a single word. -->
        <xsl:variable name="first-word" select="substring-before(concat($string, ' '), ' ')"/>

        <xsl:call-template name="get-word-letter-pairs">
          <!-- Everything after the first space; empty when no space remains, which ends the recursion. -->
          <xsl:with-param name="string" select="substring-after($string, ' ')"/>
          <!-- Previously collected pairs followed by the pairs of the first word. -->
          <xsl:with-param name="pairs">
            <xsl:value-of select="$pairs"/>
            <xsl:call-template name="get-letter-pairs">
              <xsl:with-param name="word" select="$first-word"/>
            </xsl:call-template>
          </xsl:with-param>
        </xsl:call-template>
      </xsl:when>
      <xsl:otherwise>
        <xsl:value-of select="$pairs"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

  <!--
    get-letter-pairs: list the adjacent character pairs of one word.

    Parameters:
      word  - the remaining part of the word.
      pairs - pair list accumulated so far (defaults to empty).

    Output: every two-character window of the word, each followed by a
    single space. A word shorter than two characters contributes nothing.
  -->
  <xsl:template name="get-letter-pairs">
    <xsl:param name="word"/>
    <xsl:param name="pairs"></xsl:param>

    <xsl:choose>
      <xsl:when test="string-length($word) &gt;= 2">
        <xsl:call-template name="get-letter-pairs">
          <!-- Slide the window one character to the right. -->
          <xsl:with-param name="word" select="substring($word, 2)"/>
          <xsl:with-param name="pairs" select="concat($pairs, substring($word, 1, 2), ' ')"/>
        </xsl:call-template>
      </xsl:when>
      <xsl:otherwise>
        <xsl:value-of select="$pairs"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

</xsl:stylesheet>

--
Roger L. Cauvin
"Smart product decisions"
Twitter: @rcauvin
http://blog.cauvin.org

Current Thread