[xsl] complex transform vs. generated transform performance..

Subject: [xsl] complex transform vs. generated transform performance..
From: Brian Chrisman <incubus@xxxxxxxxxxxxxx>
Date: Sun, 5 Jun 2005 15:52:58 -0700
Hello All,
 
I slapped together a quick XSLt script to convert an XML document 
with a simple flat structure into a postgres-optimized table-load script.
I had a schema document (XSD), so I used that to defined the transformation,
but I also used the assumption that the schema was a simple 2-level document.

The XML is a reasonable size .. 19MB.
My first attempt is a stylesheet which dynamically used the information
from the XSD to pull the fields from the XML.  This was taking > 30 minutes 
to run. 
It seemed to work, though I only ran it on a few similarly formatted test
documents. (The smallest of which, I'm including below)
(I used sablotron... XSLT1.0..  xsltproc/libxslt was taking much longer)

In the second attempt, I converted that stylesheet to one which generated
a specific stylesheet for that XML, which did the transformation in 30 seconds.
(The results diff'ed to be identical, minus one newline).

In general, I lean towards 1-stage processes over 2-stage processes when
I don't see a real usefulness to the intermediate stage (ie. modularizing
functionality).

If anybody spots a way to optimize this first stylesheet, I'd appreciate it.

If the schema document I'm including below is too big or verbose, I could
find a way to cut the example down a bit... in general though, it's just
a flat space, like the XML example below it.

Thanks,
Brian Chrisman


(hopefully my long lines or other formatting irregularities don't cause a 
huge problem here..)
-- xsd2insertgenerator.xsl
<xsl:stylesheet 
	xmlns:xsl="http://www.w3.org/1999/XSL/Transform"; 
	xmlns:xsd="http://www.w3.org/2001/XMLSchema"; 
	xmlns:od="urn:schemas-microsoft-com:officedata"
	exclude-result-prefixes="xsd od"
	version="1.0">
	<xsl:output method="text" omit-xml-declaration="yes"/>

	<!-- just imports the data, does not create the table.. etc -->

	<xsl:variable name="schema" select="document('/www/MAD/MAD.xsd')"/>
	<xsl:variable name="table" select="$schema/xsd:schema/xsd:element[1]//@ref"/>
	<xsl:variable name="element-list" select="$schema/xsd:schema/xsd:element[@name=$schema/xsd:schema/xsd:element[1]//@ref]//xsd:element"/>

	<!-- some fields may exist *only* in the schema document -->
	<xsl:variable name="fields" select="$schema/xsd:schema/xsd:element[@name=/xsd:schema/xsd:element[1]//@ref]//xsd:element"/>

	<xsl:template match="/">
		<xsl:value-of select="concat('COPY &quot;', $table, '&quot; FROM stdin;&#xA;')"/>
		<xsl:apply-templates select="/dataroot/*" mode="one-record"/>  <!-- Flat space imports only -->
		<xsl:text>\.</xsl:text>
	</xsl:template>

	<xsl:template match="*" mode="one-record">
		<xsl:variable name="record" select="."/>
		<!-- Note: below I am looping over the entries in the XSD doc, and looking up corresponding values in the XML record -->
		<xsl:for-each select="$element-list">
			<xsl:value-of select="$record/*[local-name(.) = current()/@name]"/>
			<xsl:if test="position() &lt; last()"><xsl:text>&#x9;</xsl:text></xsl:if>
		</xsl:for-each>
		<xsl:text>&#xA;</xsl:text>
	</xsl:template>
		
</xsl:stylesheet>

-- xsd2insertgeneratorgenerator.xsl
<xsl:stylesheet
	version="1.0"
	xmlns:xsl="http://www.w3.org/1999/XSL/Transform";
	xmlns:gen="dummy-namespace-for-the-generated-xslt"
	xmlns:xsd="http://www.w3.org/2001/XMLSchema"; 
	xmlns:od="urn:schemas-microsoft-com:officedata"
	exclude-result-prefixes="xsd od xsl">
	<xsl:output method="xml" indent="yes"/>
	
	<xsl:namespace-alias stylesheet-prefix="gen" result-prefix="xsl"/>
	<xsl:variable name="schema" select="document('/www/MAD/MAD.xsd')"/>
	<xsl:variable name="table" select="$schema/xsd:schema/xsd:element[1]//@ref"/>
	<xsl:variable name="element-list" select="$schema/xsd:schema/xsd:element[@name=$schema/xsd:schema/xsd:element[1]//@ref]//xsd:element"/>

	<xsl:template match="/">
		<gen:stylesheet version="1.0">
		<gen:output method="txt" omit-xml-declaration="yes"/>

			<gen:template match="/">
				<gen:value-of select="concat('COPY &quot;', '{$table}', '&quot; FROM stdin;&#xA;')"/>
				<gen:apply-templates select="/dataroot/{$table}"/>  <!-- Flat space imports only -->
				<gen:text>\.</gen:text>
				<gen:for-each select="{$table}">
					<xsl:for-each select="$element-list">
						<gen:value-of select="{@name}"/>
						<xsl:if test="position() &lt; last()">
							<gen:text><xsl:value-of select="'&#x9;'"/></gen:text>
						</xsl:if>
					</xsl:for-each>
					<gen:text><xsl:value-of select="'&#xA;'"/></gen:text>
				</gen:for-each>
			</gen:template>

<!--
			<gen:template match="{$table}">
				<xsl:for-each select="$element-list">
					<gen:value-of select="{@name}"/>
					<xsl:if test="position() &lt; last()">
						<gen:text><xsl:value-of select="'&#x9;'"/></gen:text>
					</xsl:if>
				</xsl:for-each>
				<gen:text><xsl:value-of select="'&#xA;'"/></gen:text>
			</gen:template>
-->
		</gen:stylesheet>
	</xsl:template>
</xsl:stylesheet>



-- MAD.xsd
<?xml version="1.0" encoding="UTF-8"?>
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"; xmlns:od="urn:schemas-microsoft-com:officedata">
  <xsd:element name="dataroot">
    <xsd:complexType>
      <xsd:sequence>
        <xsd:element ref="MAD" minOccurs="0" maxOccurs="unbounded"/>
      </xsd:sequence>
      <xsd:attribute name="generated" type="xsd:dateTime"/>
    </xsd:complexType>
  </xsd:element>
  <xsd:element name="MAD">
    <xsd:annotation>
      <xsd:appinfo/>
    </xsd:annotation>
    <xsd:complexType>
      <xsd:sequence>
        <xsd:element name="ADDRESS_ID" minOccurs="1" od:jetType="double" od:sqlSType="float" od:nonNullable="yes" type="xsd:double"/>
        <xsd:element name="APN" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="14"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="PREFIX_TYPE" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="15"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="PREFIX_DIR" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="5"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="ST_NUMBER" minOccurs="0" od:jetType="double" od:sqlSType="float" type="xsd:double"/>
        <xsd:element name="ST_NAME" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="30"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="ST_TYPE" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="SUFFIX_DIR" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="5"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="CITY" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="25"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="ZIP" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="LONG_ADDRESS" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="100"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="ADDR_FRACTION" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="5"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="UNIT_DESIGNATOR" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="UNIT" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="FLOOR" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="5"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="BUILDING" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="30"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="ADDRESS_TYPE" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="ADDRESS_STATUS" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="20"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="ADDRESS_USE" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="25"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="PRIVATE" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="INCORPORATED" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="X_COORD" minOccurs="0" od:jetType="double" od:sqlSType="float" type="xsd:double"/>
        <xsd:element name="Y_COORD" minOccurs="0" od:jetType="double" od:sqlSType="float" type="xsd:double"/>
        <xsd:element name="DATE_ENTERED" minOccurs="0" od:jetType="datetime" od:sqlSType="datetime" type="xsd:dateTime"/>
        <xsd:element name="DATE_UPDATED" minOccurs="0" od:jetType="datetime" od:sqlSType="datetime" type="xsd:dateTime"/>
        <xsd:element name="DATE_RETIRED" minOccurs="0" od:jetType="datetime" od:sqlSType="datetime" type="xsd:dateTime"/>
        <xsd:element name="SOURCE" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="30"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="BUS_LIC_FLAG" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="SERC_REQ_FLAG" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="15"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="PROJECT_FLAG" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="15"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="PERMIT_FLAG" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="15"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="ZIP4_FLAG" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="1"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
        <xsd:element name="COORD_SOURCE" minOccurs="0" od:jetType="text" od:sqlSType="nvarchar">
          <xsd:simpleType>
            <xsd:restriction base="xsd:string">
              <xsd:maxLength value="10"/>
            </xsd:restriction>
          </xsd:simpleType>
        </xsd:element>
      </xsd:sequence>
    </xsd:complexType>
  </xsd:element>
</xsd:schema>

-- foo.xml (test data)
<?xml version="1.0" encoding="UTF-8"?>
<dataroot xmlns:od="urn:schemas-microsoft-com:officedata" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"; xsi:noNamespaceSchemaLocation="MAD.xsd" generated="2005-04-29T15:13:49">
  <MAD>
    <ADDRESS_ID>1</ADDRESS_ID>
    <APN>22400900350000</APN>
    <ST_NUMBER>7453</ST_NUMBER>
    <ST_NAME>TIARA</ST_NAME>
    <ST_TYPE>WAY</ST_TYPE>
    <CITY>ORANGEVALE</CITY>
    <ZIP>95662</ZIP>
    <LONG_ADDRESS>7453 TIARA WAY       B</LONG_ADDRESS>
    <UNIT>B</UNIT>
    <X_COORD>6768867.42298</X_COORD>
    <Y_COORD>2018154.73453</Y_COORD>
    <DATE_ENTERED>2005-04-19T00:00:00</DATE_ENTERED>
    <SOURCE>GISDROP_20050225</SOURCE>
    <BUS_LIC_FLAG>N</BUS_LIC_FLAG>
    <SERC_REQ_FLAG>N</SERC_REQ_FLAG>
    <PROJECT_FLAG>N</PROJECT_FLAG>
    <PERMIT_FLAG>Y</PERMIT_FLAG>
    <ZIP4_FLAG>N</ZIP4_FLAG>
    <COORD_SOURCE>CENTROID</COORD_SOURCE>
  </MAD>
  <MAD>
    <ADDRESS_ID>8</ADDRESS_ID>
    <APN>24301920130000</APN>
    <ST_NUMBER>6054</ST_NUMBER>
    <ST_NAME>SUNRISE</ST_NAME>
    <ST_TYPE>MALL</ST_TYPE>
    <CITY>CITRUS HEIGHTS</CITY>
    <ZIP>95610</ZIP>
    <LONG_ADDRESS>6054 SUNRISE MALL       B7</LONG_ADDRESS>
    <UNIT>B7</UNIT>
    <X_COORD>6770356.0616</X_COORD>
    <Y_COORD>2007415.14451</Y_COORD>
    <DATE_ENTERED>2005-04-19T00:00:00</DATE_ENTERED>
    <SOURCE>GISDROP_20050225</SOURCE>
    <BUS_LIC_FLAG>N</BUS_LIC_FLAG>
    <SERC_REQ_FLAG>N</SERC_REQ_FLAG>
    <PROJECT_FLAG>N</PROJECT_FLAG>
    <PERMIT_FLAG>Y</PERMIT_FLAG>
    <ZIP4_FLAG>Y</ZIP4_FLAG>
    <COORD_SOURCE>CENTROID</COORD_SOURCE>
  </MAD>
  <MAD>
    <ADDRESS_ID>9</ADDRESS_ID>
    <APN>24301500230000</APN>
    <ST_NUMBER>7669</ST_NUMBER>
    <ST_NAME>GREENBACK</ST_NAME>
    <ST_TYPE>LN</ST_TYPE>
    <ZIP>0</ZIP>
    <LONG_ADDRESS>7669 GREENBACK LN       2002</LONG_ADDRESS>
    <UNIT>2002</UNIT>
    <X_COORD>6766774.11111</X_COORD>
    <Y_COORD>2010059.02536</Y_COORD>
    <DATE_ENTERED>2005-04-19T00:00:00</DATE_ENTERED>
    <SOURCE>GISDROP_20050225</SOURCE>
    <BUS_LIC_FLAG>N</BUS_LIC_FLAG>
    <SERC_REQ_FLAG>N</SERC_REQ_FLAG>
    <PROJECT_FLAG>N</PROJECT_FLAG>
    <PERMIT_FLAG>Y</PERMIT_FLAG>
    <ZIP4_FLAG>N</ZIP4_FLAG>
    <COORD_SOURCE>CENTROID</COORD_SOURCE>
  </MAD>
</dataroot>

Current Thread