[xsl] Improving memory usage when processing multiple documents

Subject: [xsl] Improving memory usage when processing multiple documents
From: Øyvind Gjesdal <oyvindlg@xxxxxxxxx>
Date: Tue, 28 Jun 2011 14:55:05 +0200
Hey xslt-list.

We are currently working on transforming old digital copies of parish
records from old datasources into xml-files. The main tool used for
this transformation has been OxygenXML and SAXON EE 9.3.0.5. Now I'd
like to make a simple style sheet that looks up the
kyrre/registrering/grunnlag element in the new files we've created,
put the old file names in a variable, and look up in the old meta
table for file names not present in the variable. The look up works at
the moment, but I meet a l wall at 3,2 gb of memory usage.  I have
tried to implement the saxon:stream function without success. At the
moment I'm stuck with working around the problem by dividing the jobs,
but since this isn't the first time I've encountered this type of
memory issue working on the data sets, I'd rather ask for help and
hopefully learn. Below is example source of files.

Best regards,

Xyvind Gjesdal,

Digital Archives, Statsarkivet i Bergen

Source document input: restfiler_kyrre.xml - metadata for the original
source files before transformation. (2200 rows)

doc() list.xml - list containing iri to transformed files to look up (2000
rows)

doc()* individual_document_from_list.xml  (all of list excepting
failed transforms.Ranges between 25kb and 100+mb)

files_not_transformed.xsl - stylesheet for looking up which files are
missing from the old archives.

Files:

files_not_transformed.xsl

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform";
xmlns:xs="http://www.w3.org/2001/XMLSchema"; exclude-result-prefixes="xs"
xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl";
xmlns:xsffil="http://www.statsarkivet/bergen/fil";
xmlns:saxon="http://saxon.sf.net/";
version="2.0">

<xsl:output indent="yes"/>

<xsl:template match="row" mode="nye_filer">
<xsl:if test="doc-available((concat('file:/',iri-to-uri(.))))">

<xsl:variable name="nyFil" as="node()"><xsl:sequence
select="saxon:stream(doc(concat('file:/',iri-to-uri(.)))/kyrre/registrering/g
runnlag)"/>
</xsl:variable>
<xsl:value-of select="xsffil:get_old_filename($nyFil)"/>
</xsl:if>
</xsl:template>

<xsl:template match="root">
<xsl:variable name="old_filenames" as="xs:string*">
<xsl:element name="root">
<xsl:apply-templates
select="saxon:stream(doc('file:/A:/list.xml')/*/row)"
mode="nye_filer"/>
</xsl:element></xsl:variable>
<xsl:element name="root">


<xsl:apply-templates select="row">
<xsl:with-param name="old_filenames" select="$old_filenames"/>
</xsl:apply-templates>

</xsl:element>
</xsl:template>

<xsl:template match="row">
<xsl:param name="old_filenames"/>
<xsl:if test="not(some $verdie in $old_filenames satisfies
lower-case($verdie) eq lower-case(concat(Filnamn,'.xml')))">
<xsl:element name="fil"><xsl:attribute name="filnamn"
select="Filnamn"></xsl:attribute><xsl:value-of
select="Databasenamn/text()"></xsl:value-of></xsl:element>
</xsl:if>
</xsl:template>

<xsl:function name="xsffil:get_old_filename" as="xs:string">
<xsl:param name="fil" as="node()"/>
<xsl:variable name="quote">
<xsl:text>'</xsl:text>
</xsl:variable>
<xsl:sequence select="(tokenize($fil/text(),$quote)[2],'')[1]"/>
</xsl:function>
</xsl:stylesheet>

restfiler_kyrre.xml

<row nr="1">
<Databasenamn>Dxypte i Gran prestegjeld 1787-1804</Databasenamn>
<EngelskNamn>Baptisms in Gran parish 1787-1904</EngelskNamn>
<DatoOppretta>12.12.2002</DatoOppretta>
<Eigar>GDS-Hadeland</Eigar>
<Pensjonatet>digitalpensjonatet\gdshade.htm</Pensjonatet>
<Pensjonatnamn>GDS-Hadeland</Pensjonatnamn>
<PensjonatEngelskNamn>GDS-Hadeland</PensjonatEngelskNamn>
<global>0</global>
<metanr>2451</metanr>
<Doknamn/>
<Filnamn>Dp05341788</Filnamn>
<kategori>4</kategori>
<emne>1</emne>
<FAar>1787</FAar>
<SAar>1804</SAar>
<AntFylke>1</AntFylke>
<Fylke>5</Fylke>
<AntKommune>0</AntKommune>
<Kommune>0</Kommune>
<AntNoFylke>0</AntNoFylke>
<NoFylke>0</NoFylke>
<AntNoKommune>0</AntNoKommune>
<NoKommune>0</NoKommune>
<passiv>0</passiv>
<Heading24>0</Heading24>
<Heading25>0</Heading25>
<Heading26>0</Heading26>
<Heading27>0</Heading27>
<Heading28>0</Heading28>
<Heading29>0</Heading29>
<Heading30>0</Heading30>
<Heading31>0</Heading31>
<Heading32>0</Heading32>
<Heading33>0</Heading33>
<Heading34>0</Heading34>
<Heading35>0</Heading35>
<Heading36>1</Heading36>
<Heading37>534</Heading37>
</row>

</root>

list.xml

<root>
<row>I:/USERDIRS/xyvind G/structure/data/in/out/da/kjeldeid/vi_Stokke
MINI 1.xml</row>
<row>I:/USERDIRS/xyvind G/structure/data/in/out/da/kjeldeid/gr_Stord
MINI A 5.xml</row>
<row>I:/USERDIRS/xyvind G/structure/data/in/out/da/kjeldeid/dp_Orkdal
MINI 668A06.xml</row>
<row>I:/USERDIRS/xyvind G/structure/data/in/out/da/kjeldeid/kf_Stord
MINI A 2.xml</row>
<row>I:/USERDIRS/xyvind G/structure/data/in/out/da/kjeldeid/vi_Lindes
MINI A 17.xml</row>

</root>

individual_document_from_list.xml

<kyrre>
<kjelde kjeldeid="7002">
<prgjeld>Fitjar</prgjeld>
<kommnr>1222</kommnr>
<soknenr>vi12221862.htm</soknenr>
<ktype>MINI</ktype>
<signatur>A 1</signatur>
<ltype>vi</ltype>
<startaar>1862</startaar>
<sluttaar>1881</sluttaar>
<oppbstad/>
<merknader>Vigde i Fitjar 1862-1925 {prot_ref:}fitjar#MINI#a1#</merknader>
</kjelde>
<registrering>
<reg_av/>
<foretak>Digitalarkivet</foretak>
<kontakt/>
<grunnlag>Fil 'Vi12221862.xml' fra gamle Digitalarkivet.</grunnlag>
<program/>
<fildato>30.12.2004</fildato>
</registrering>
<vi vinr="1">
<side>166</side>
<lopenr>1</lopenr>
<aar>1862</aar>
<vidato>08.02</vidato>
<vi_person pnr="1">
<rolle>Brudgom</rolle>
<stilling_stand>Styrmand</stilling_stand>
<forenamn>Johan Christopher</forenamn>
<etternamn>Meyer</etternamn>
<kjonn>m</kjonn>
<alder>58</alder>
<fodestad>Bergen</fodestad>
</vi_person>
<vi_person pnr="2">
<rolle>Brur</rolle>
<stilling_stand>Jomfru</stilling_stand>
<forenamn>Anne Cathrine</forenamn>
<etternamn>Hagelsteen</etternamn>
<kjonn>k</kjonn>
<alder>27</alder>
<fodestad>Bergen</fodestad>
</vi_person>
<vi_person pnr="3">
<rolle>Brugdom far</rolle>
<stilling_stand>Snedkermester</stilling_stand>
<forenamn>Wilken</forenamn>
<etternamn>Meyer</etternamn>
<kjonn/>
</vi_person>
<vi_person pnr="4">
<rolle>Brur far</rolle>
<stilling_stand>Fiske??</stilling_stand>
<forenamn>Sxren S.</forenamn>
<etternamn>Hagelsteen</etternamn>
<kjonn>m</kjonn>
</vi_person>
</vi>

</kyrre>

Current Thread