@@ -19,14 +19,25 @@

import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import org.dkpro.jwpl.wikimachine.debug.ILogger;
import org.dkpro.jwpl.wikimachine.domain.Files;
import org.dkpro.jwpl.wikimachine.util.DumpFileDiscovery;

/**
* A {@link Files} implementation specific for the DataMachine tool.
* It defines file name constants and provides methods for
* input/output directory building rules and checks.
* <p>
* Wikimedia publishes large XML dumps split across several files (see
* {@link DumpFileDiscovery}). For the {@code pages-articles} and {@code pages-meta-current}
* roles this class keeps the ordered list of parts and exposes both the legacy singular
* getter (first part of the ordered list, for backwards compatibility) and a list getter
* that returns every part.
*
* @see Files
*/
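To make the two getter flavours described above concrete, here is a minimal usage sketch. Only the logger constructor and the getters shown below are part of this change; the setDataDirectory(String) setter is an assumption used for illustration.

// Minimal usage sketch. The setDataDirectory(String) setter is hypothetical;
// only the logger constructor and the getters below appear in this change.
static void listArticleParts(ILogger logger) {
    DataMachineFiles files = new DataMachineFiles(logger);
    files.setDataDirectory("/data/dumps/enwiki");

    // Legacy getter: first part only, null when the role was not discovered.
    System.out.println("first part: " + files.getInputPagesArticles());

    // List getter: every discovered part, ordered by ascending page range.
    for (String part : files.getInputPagesArticlesFiles()) {
        System.out.println("pages-articles part: " + part);
    }
}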
@@ -35,8 +46,8 @@ public class DataMachineFiles
{
private final static String INPUT_PAGELINKS = "pagelinks.sql";
private final static String INPUT_CATEGORYLINKS = "categorylinks.sql";
private final static String INPUT_PAGESARTICLES = "pages-articles.xml";
private final static String INPUT_PAGESMETACURRENT = "pages-meta-current.xml";
private final static String INPUT_PAGESARTICLES = "pages-articles";
private final static String INPUT_PAGESMETACURRENT = "pages-meta-current";

private final static String GENERATED_PAGE = "page.bin";
private final static String GENERATED_REVISION = "revision.bin";
@@ -48,13 +59,15 @@ public class DataMachineFiles

private final static String ARCHIVE_EXTENSION = ".gz";

private final static Set<String> SUPPORTED_EXTENSIONS = Set.of("bz2", "gz", "7z");

private File dataDirectory = new File(".");
private boolean compressGeneratedFiles = false;

private File inputPagelinks = null;
private File inputPagesarticles = null;
private File inputCategorylinks = null;
private File inputPagesMetaCurrent = null;
private List<File> inputPagesarticles = new ArrayList<>();
private List<File> inputPagesMetaCurrent = new ArrayList<>();

/**
* Instantiates a {@link Files} object with the specified {@code logger}.
@@ -77,9 +90,9 @@ public DataMachineFiles(DataMachineFiles files)
super(files);
this.dataDirectory = files.dataDirectory;
this.inputPagelinks = files.inputPagelinks;
this.inputPagesarticles = files.inputPagesarticles;
this.inputPagesarticles = new ArrayList<>(files.inputPagesarticles);
this.inputCategorylinks = files.inputCategorylinks;
this.inputPagesMetaCurrent = files.inputPagesMetaCurrent;
this.inputPagesMetaCurrent = new ArrayList<>(files.inputPagesMetaCurrent);
this.compressGeneratedFiles = files.compressGeneratedFiles;
}

@@ -108,30 +121,34 @@ private boolean checkDataMachineSourceFiles()
{
final FileFilter supportedFormatFilter = file -> {
final String name = file.getName();
// See UniversalDecompressor for all built-in decompression formats. For now:
return name.endsWith(".7z") || name.endsWith(".gz") || name.endsWith(".bz2");
};
final File[] files = dataDirectory.listFiles(supportedFormatFilter);
if (files != null && files.length > 2) {
if (files != null && files.length >= 3) {
final List<File> articleParts = new ArrayList<>();
final List<File> metaCurrentParts = new ArrayList<>();
for (File currentFile : files) {
String currentFileName = currentFile.getName();
if (currentFileName.contains(INPUT_PAGESARTICLES)) {
inputPagesarticles = currentFile;
final String name = currentFile.getName();
if (DumpFileDiscovery.matchesRole(name, INPUT_PAGESARTICLES, SUPPORTED_EXTENSIONS)) {
articleParts.add(currentFile);
}
else if (DumpFileDiscovery.matchesRole(name, INPUT_PAGESMETACURRENT,
SUPPORTED_EXTENSIONS)) {
metaCurrentParts.add(currentFile);
}
else if (currentFileName.contains(INPUT_PAGELINKS)) {
else if (name.contains(INPUT_PAGELINKS)) {
inputPagelinks = currentFile;
}
else if (currentFileName.contains(INPUT_CATEGORYLINKS)) {
else if (name.contains(INPUT_CATEGORYLINKS)) {
inputCategorylinks = currentFile;
}
else if (currentFileName.contains(INPUT_PAGESMETACURRENT)) {
inputPagesMetaCurrent = currentFile;
}
}
inputPagesarticles = DumpFileDiscovery.orderByPageRange(articleParts);
inputPagesMetaCurrent = DumpFileDiscovery.orderByPageRange(metaCurrentParts);
}
// either inputPagesarticles or inputPagesMetaCurrent has to be placed
// in the input directory
return !((inputPagesarticles == null && inputPagesMetaCurrent == null)
return !((inputPagesarticles.isEmpty() && inputPagesMetaCurrent.isEmpty())
|| inputPagelinks == null || inputCategorylinks == null);
}
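The two DumpFileDiscovery helpers used above are not part of this hunk. As a rough sketch of what they might do, the following matches a role name against the supported extensions and orders parts by their first page id; the p&lt;start&gt;p&lt;end&gt; pattern and all names here are assumptions, not the real implementation.

// Rough, hypothetical sketch of the DumpFileDiscovery helpers used above; the
// real implementation is not shown in this diff, so the part-range pattern and
// all names are assumptions.
import java.io.File;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

final class DumpFileDiscoverySketch {

    // e.g. "enwiki-20240101-pages-articles1.xml-p1p41242.bz2"
    private static final Pattern PART_RANGE = Pattern.compile("p(\\d+)p(\\d+)");

    static boolean matchesRole(String fileName, String role, Set<String> extensions) {
        if (!fileName.contains(role)) {
            return false;
        }
        final int dot = fileName.lastIndexOf('.');
        return dot >= 0 && extensions.contains(fileName.substring(dot + 1));
    }

    static List<File> orderByPageRange(List<File> parts) {
        return parts.stream()
                .sorted(Comparator.comparingLong(DumpFileDiscoverySketch::firstPageId))
                .collect(Collectors.toList());
    }

    private static long firstPageId(File file) {
        final Matcher m = PART_RANGE.matcher(file.getName());
        return m.find() ? Long.parseLong(m.group(1)) : Long.MAX_VALUE;
    }
}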

@@ -179,14 +196,29 @@ public String getInputPageLinks()
}

/**
* @return Retrieves the absolute path of the {@code pages-articles.xml} file.
* @return The absolute path of the first {@code pages-articles.xml} part,
*         or {@code null} if none was discovered. For multi-part dumps, prefer
*         {@link #getInputPagesArticlesFiles()}.
*/
public String getInputPagesArticles()
{
if (inputPagesarticles == null) {
if (inputPagesarticles.isEmpty()) {
checkDataMachineSourceFiles();
}
return inputPagesarticles != null ? inputPagesarticles.getAbsolutePath() : null;
return inputPagesarticles.isEmpty() ? null : inputPagesarticles.get(0).getAbsolutePath();
}

/**
* @return Absolute paths of all {@code pages-articles.xml} parts ordered by ascending page
* range. Empty if the dump is not available. A single-file dump yields a list of
* size 1.
*/
public List<String> getInputPagesArticlesFiles()
{
if (inputPagesarticles.isEmpty()) {
checkDataMachineSourceFiles();
}
return toAbsolutePathList(inputPagesarticles);
}

/**
@@ -201,14 +233,41 @@ public String getInputCategoryLinks()
}

/**
* @return Retrieves the absolute path of the {@code pages-meta-current.xml} file.
* @return The absolute path of the first {@code pages-meta-current.xml} part,
*         or {@code null} if none was discovered. For multi-part dumps, prefer
*         {@link #getInputPagesMetaCurrentFiles()}.
*/
public String getInputPagesMetaCurrent()
{
if (inputPagesMetaCurrent == null) {
if (inputPagesMetaCurrent.isEmpty()) {
checkDataMachineSourceFiles();
}
return inputPagesMetaCurrent.isEmpty() ? null
: inputPagesMetaCurrent.get(0).getAbsolutePath();
}

/**
* @return Absolute paths of all {@code pages-meta-current.xml} parts ordered by ascending
* page range. Empty if the dump is not available.
*/
public List<String> getInputPagesMetaCurrentFiles()
{
if (inputPagesMetaCurrent.isEmpty()) {
checkDataMachineSourceFiles();
}
return inputPagesMetaCurrent != null ? inputPagesMetaCurrent.getAbsolutePath() : null;
return toAbsolutePathList(inputPagesMetaCurrent);
}

private static List<String> toAbsolutePathList(List<File> files)
{
if (files.isEmpty()) {
return Collections.emptyList();
}
final List<String> paths = new ArrayList<>(files.size());
for (File f : files) {
paths.add(f.getAbsolutePath());
}
return paths;
}

private String getGeneratedPath(String fileName)
@@ -18,6 +19,9 @@
package org.dkpro.jwpl.datamachine.domain;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.dkpro.jwpl.datamachine.dump.xml.XML2Binary;
import org.dkpro.jwpl.wikimachine.domain.AbstractSnapshotGenerator;
@@ -85,7 +88,14 @@ private void processInputDump() throws IOException
{

logger.log("Parsing input dumps...");
new XML2Binary(decompressor.getInputStream(getPagesArticlesFile()), files);
final List<String> parts = getPagesArticlesFiles();
final List<InputStream> streams = new ArrayList<>(parts.size());
for (String part : parts) {
streams.add(decompressor.getInputStream(part));
}
// A single-file dump reduces to a one-element list; the multi-part XML2Binary
// constructor handles both cases uniformly via MultiPartXmlDumpReader.
new XML2Binary(streams, files);

dumpVersionProcessor.setDumpVersions(new IDumpVersion[] { version });

@@ -111,30 +121,28 @@ private void processInputDump() throws IOException
}

/**
* Parses either {@code pages-articles.xml} or {@code pages-meta-current.xml}.
* If both files exist in the input directory {@code pages-meta-current.xml} will be favored.
* Selects the input articles dump, preferring {@code pages-meta-current} (which also
* contains discussion pages) and falling back to {@code pages-articles}. Returns every
* part of the selected role in ascending page-range order; a single-file dump yields a
* list of size 1.
*
* @return the input articles dump
* @return the ordered list of input articles dump parts
* @throws IOException If neither dump role is present.
*/
private String getPagesArticlesFile()
private List<String> getPagesArticlesFiles() throws IOException
{
String pagesArticlesFile = null;
String parseMessage = null;

// Use of minimal dump only with articles
if (files.getInputPagesArticles() != null) {
pagesArticlesFile = files.getInputPagesArticles();
parseMessage = "Discussions are unavailable";
final List<String> metaCurrent = files.getInputPagesMetaCurrentFiles();
if (!metaCurrent.isEmpty()) {
logger.log("Discussions are available");
return metaCurrent;
}

// Use of dump with discussions
if (files.getInputPagesMetaCurrent() != null) {
pagesArticlesFile = files.getInputPagesMetaCurrent();
parseMessage = "Discussions are available";
final List<String> articles = files.getInputPagesArticlesFiles();
if (!articles.isEmpty()) {
logger.log("Discussions are unavailable");
return articles;
}

logger.log(parseMessage);
return pagesArticlesFile;
throw new IOException("No pages-articles or pages-meta-current dump found in the input "
+ "directory.");
}

private PageParser createPageParser() throws IOException
@@ -19,10 +19,13 @@

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.dkpro.jwpl.datamachine.domain.DataMachineFiles;
import org.dkpro.jwpl.mwdumper.importer.DumpWriter;
import org.dkpro.jwpl.mwdumper.importer.NamespaceFilter;
import org.dkpro.jwpl.mwdumper.importer.XmlDumpReader;
import org.dkpro.jwpl.wikimachine.dump.xml.MultiPartXmlDumpReader;

/**
* Use org.mediawiki.importer engine to parse the XML-Dump (only useful fields) and store it to
@@ -51,16 +54,36 @@ public class XML2Binary
*/
public XML2Binary(InputStream iStream, DataMachineFiles files) throws IOException
{
final DumpWriter writer = new NamespaceFilter(new SimpleBinaryDumpWriter(files),
ENABLED_NAMESPACES);
if (USE_MODIFIED_PARSER) {
// modified parser, skips faulty tags
new SimpleXmlDumpReader(iStream,
new NamespaceFilter(new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)).readDump();
new SimpleXmlDumpReader(iStream, writer).readDump();
}
else {
// original MWDumper parser, very sensitive to unclosed tags
new XmlDumpReader(iStream,
new NamespaceFilter(new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)).readDump();
new XmlDumpReader(iStream, writer).readDump();
}
}

/**
* Instantiates an {@link XML2Binary} for a multi-part Wikipedia XML dump. Every stream in
* {@code iStreams} must be a self-contained XML document with its own {@code <mediawiki>}
* root; events across parts are collapsed into a single logical document by the underlying
* {@link MultiPartXmlDumpReader}.
*
* @param iStreams Ordered list of XML part streams (ascending page-range). Must not be
* {@code null} or empty; must not contain {@code null} elements.
* @param files The {@link DataMachineFiles} configuration to apply.
* @throws IOException Thrown if IO errors occurred during processing.
*/
public XML2Binary(List<InputStream> iStreams, DataMachineFiles files) throws IOException
{
final DumpWriter writer = new NamespaceFilter(new SimpleBinaryDumpWriter(files),
ENABLED_NAMESPACES);
// The modified parser is always used for multi-part dumps; the original XmlDumpReader
// is kept only for its stricter single-document parsing.
MultiPartXmlDumpReader.readDumps(iStreams, writer, SimpleXmlDumpReader::new);
}

}
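The MultiPartXmlDumpReader referenced above is not part of this hunk. As a conceptual illustration of the "collapse into a single logical document" idea, a delegating event sink could forward wiki-level open/close events only for the first and last part. The callback names below are stand-ins for illustration, not the real DumpWriter API.

// Conceptual sketch only: several self-contained dump parts are made to look
// like one document by suppressing duplicate wiki open/close events.
// PartEvents is a stand-in interface, not the real DumpWriter.
interface PartEvents {
    void startWiki();
    void endWiki();
    void page(String title);
}

final class CollapsingPartEvents implements PartEvents {
    private final PartEvents delegate;
    private final boolean firstPart;
    private final boolean lastPart;

    CollapsingPartEvents(PartEvents delegate, boolean firstPart, boolean lastPart) {
        this.delegate = delegate;
        this.firstPart = firstPart;
        this.lastPart = lastPart;
    }

    @Override public void startWiki() { if (firstPart) { delegate.startWiki(); } } // emit once
    @Override public void endWiki() { if (lastPart) { delegate.endWiki(); } }      // emit once
    @Override public void page(String title) { delegate.page(title); }             // always forward
}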