@@ -19,14 +19,25 @@

import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import org.dkpro.jwpl.wikimachine.debug.ILogger;
import org.dkpro.jwpl.wikimachine.domain.Files;
import org.dkpro.jwpl.wikimachine.util.DumpFileDiscovery;

/**
* A {@link Files} implementation specific for the DataMachine tool.
* It defines file name constants and provides methods for
* input/output directory building rules and checks.
* <p>
* Wikimedia publishes large XML dumps split across several files (see
* {@link DumpFileDiscovery}). For the {@code pages-articles} and {@code pages-meta-current}
* roles this class keeps the ordered list of parts and exposes both the legacy singular
* getter (first part of the ordered list, for backwards compatibility) and a list getter
* that returns every part.
*
* @see Files
*/
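To make the two getter flavours described above concrete, here is a minimal usage sketch. Only the logger constructor and the getters shown below are part of this change; the setDataDirectory(String) setter is an assumption used for illustration.

// Minimal usage sketch. The setDataDirectory(String) setter is hypothetical;
// only the logger constructor and the getters below appear in this change.
static void listArticleParts(ILogger logger) {
    DataMachineFiles files = new DataMachineFiles(logger);
    files.setDataDirectory("/data/dumps/enwiki");

    // Legacy getter: first part only, null when the role was not discovered.
    System.out.println("first part: " + files.getInputPagesArticles());

    // List getter: every discovered part, ordered by ascending page range.
    for (String part : files.getInputPagesArticlesFiles()) {
        System.out.println("pages-articles part: " + part);
    }
}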
@@ -35,8 +46,8 @@ public class DataMachineFiles
{
private final static String INPUT_PAGELINKS = "pagelinks.sql";
private final static String INPUT_CATEGORYLINKS = "categorylinks.sql";
private final static String INPUT_PAGESARTICLES = "pages-articles.xml";
private final static String INPUT_PAGESMETACURRENT = "pages-meta-current.xml";
private final static String INPUT_PAGESARTICLES = "pages-articles";
private final static String INPUT_PAGESMETACURRENT = "pages-meta-current";

private final static String GENERATED_PAGE = "page.bin";
private final static String GENERATED_REVISION = "revision.bin";
@@ -48,13 +59,15 @@ public class DataMachineFiles

private final static String ARCHIVE_EXTENSION = ".gz";

private final static Set<String> SUPPORTED_EXTENSIONS = Set.of("bz2", "gz", "7z");

private File dataDirectory = new File(".");
private boolean compressGeneratedFiles = false;

private File inputPagelinks = null;
private File inputPagesarticles = null;
private File inputCategorylinks = null;
private File inputPagesMetaCurrent = null;
private List<File> inputPagesarticles = new ArrayList<>();
private List<File> inputPagesMetaCurrent = new ArrayList<>();

/**
* Instantiates a {@link Files} object with the specified {@code logger}.
@@ -77,9 +90,9 @@ public DataMachineFiles(DataMachineFiles files)
super(files);
this.dataDirectory = files.dataDirectory;
this.inputPagelinks = files.inputPagelinks;
this.inputPagesarticles = files.inputPagesarticles;
this.inputPagesarticles = new ArrayList<>(files.inputPagesarticles);
this.inputCategorylinks = files.inputCategorylinks;
this.inputPagesMetaCurrent = files.inputPagesMetaCurrent;
this.inputPagesMetaCurrent = new ArrayList<>(files.inputPagesMetaCurrent);
this.compressGeneratedFiles = files.compressGeneratedFiles;
}

@@ -108,30 +121,34 @@ private boolean checkDataMachineSourceFiles()
{
final FileFilter supportedFormatFilter = file -> {
final String name = file.getName();
// See UniversalDecompressor for all built-in decompression formats. For now:
return name.endsWith(".7z") || name.endsWith(".gz") || name.endsWith(".bz2");
};
final File[] files = dataDirectory.listFiles(supportedFormatFilter);
if (files != null && files.length > 2) {
if (files != null && files.length >= 3) {
final List<File> articleParts = new ArrayList<>();
final List<File> metaCurrentParts = new ArrayList<>();
for (File currentFile : files) {
String currentFileName = currentFile.getName();
if (currentFileName.contains(INPUT_PAGESARTICLES)) {
inputPagesarticles = currentFile;
final String name = currentFile.getName();
if (DumpFileDiscovery.matchesRole(name, INPUT_PAGESARTICLES, SUPPORTED_EXTENSIONS)) {
articleParts.add(currentFile);
}
else if (DumpFileDiscovery.matchesRole(name, INPUT_PAGESMETACURRENT,
SUPPORTED_EXTENSIONS)) {
metaCurrentParts.add(currentFile);
}
else if (currentFileName.contains(INPUT_PAGELINKS)) {
else if (name.contains(INPUT_PAGELINKS)) {
inputPagelinks = currentFile;
}
else if (currentFileName.contains(INPUT_CATEGORYLINKS)) {
else if (name.contains(INPUT_CATEGORYLINKS)) {
inputCategorylinks = currentFile;
}
else if (currentFileName.contains(INPUT_PAGESMETACURRENT)) {
inputPagesMetaCurrent = currentFile;
}
}
inputPagesarticles = DumpFileDiscovery.orderByPageRange(articleParts);
inputPagesMetaCurrent = DumpFileDiscovery.orderByPageRange(metaCurrentParts);
}
// either inputPagesarticles or inputPagesMetaCurrent has to be placed
// in the input directory
return !((inputPagesarticles == null && inputPagesMetaCurrent == null)
return !((inputPagesarticles.isEmpty() && inputPagesMetaCurrent.isEmpty())
|| inputPagelinks == null || inputCategorylinks == null);
}
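The two DumpFileDiscovery helpers used above are not part of this hunk. As a rough sketch of what they might do, the following matches a role name against the supported extensions and orders parts by their first page id; the p&lt;start&gt;p&lt;end&gt; pattern and all names here are assumptions, not the real implementation.

// Rough, hypothetical sketch of the DumpFileDiscovery helpers used above; the
// real implementation is not shown in this diff, so the part-range pattern and
// all names are assumptions.
import java.io.File;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

final class DumpFileDiscoverySketch {

    // e.g. "enwiki-20240101-pages-articles1.xml-p1p41242.bz2"
    private static final Pattern PART_RANGE = Pattern.compile("p(\\d+)p(\\d+)");

    static boolean matchesRole(String fileName, String role, Set<String> extensions) {
        if (!fileName.contains(role)) {
            return false;
        }
        final int dot = fileName.lastIndexOf('.');
        return dot >= 0 && extensions.contains(fileName.substring(dot + 1));
    }

    static List<File> orderByPageRange(List<File> parts) {
        return parts.stream()
                .sorted(Comparator.comparingLong(DumpFileDiscoverySketch::firstPageId))
                .collect(Collectors.toList());
    }

    private static long firstPageId(File file) {
        final Matcher m = PART_RANGE.matcher(file.getName());
        return m.find() ? Long.parseLong(m.group(1)) : Long.MAX_VALUE;
    }
}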

@@ -179,14 +196,29 @@ public String getInputPageLinks()
}

/**
* @return Retrieves the absolute path of the {@code pages-articles.xml} file.
* @return The absolute path of the first {@code pages-articles.xml} part,
*         or {@code null} if none was discovered. For multi-part dumps, prefer
*         {@link #getInputPagesArticlesFiles()}.
*/
public String getInputPagesArticles()
{
if (inputPagesarticles == null) {
if (inputPagesarticles.isEmpty()) {
checkDataMachineSourceFiles();
}
return inputPagesarticles != null ? inputPagesarticles.getAbsolutePath() : null;
return inputPagesarticles.isEmpty() ? null : inputPagesarticles.get(0).getAbsolutePath();
}

/**
* @return Absolute paths of all {@code pages-articles.xml} parts ordered by ascending page
* range. Empty if the dump is not available. A single-file dump yields a list of
* size 1.
*/
public List<String> getInputPagesArticlesFiles()
{
if (inputPagesarticles.isEmpty()) {
checkDataMachineSourceFiles();
}
return toAbsolutePathList(inputPagesarticles);
}

/**
@@ -201,14 +233,41 @@ public String getInputCategoryLinks()
}

/**
* @return Retrieves the absolute path of the {@code pages-meta-current.xml} file.
* @return The absolute path of the first {@code pages-meta-current.xml} part,
*         or {@code null} if none was discovered. For multi-part dumps, prefer
*         {@link #getInputPagesMetaCurrentFiles()}.
*/
public String getInputPagesMetaCurrent()
{
if (inputPagesMetaCurrent == null) {
if (inputPagesMetaCurrent.isEmpty()) {
checkDataMachineSourceFiles();
}
return inputPagesMetaCurrent.isEmpty() ? null
: inputPagesMetaCurrent.get(0).getAbsolutePath();
}

/**
* @return Absolute paths of all {@code pages-meta-current.xml} parts ordered by ascending
* page range. Empty if the dump is not available.
*/
public List<String> getInputPagesMetaCurrentFiles()
{
if (inputPagesMetaCurrent.isEmpty()) {
checkDataMachineSourceFiles();
}
return inputPagesMetaCurrent != null ? inputPagesMetaCurrent.getAbsolutePath() : null;
return toAbsolutePathList(inputPagesMetaCurrent);
}

private static List<String> toAbsolutePathList(List<File> files)
{
if (files.isEmpty()) {
return Collections.emptyList();
}
final List<String> paths = new ArrayList<>(files.size());
for (File f : files) {
paths.add(f.getAbsolutePath());
}
return paths;
}

private String getGeneratedPath(String fileName)
@@ -18,6 +19,9 @@
package org.dkpro.jwpl.datamachine.domain;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.dkpro.jwpl.datamachine.dump.xml.XML2Binary;
import org.dkpro.jwpl.wikimachine.domain.AbstractSnapshotGenerator;
@@ -85,7 +88,14 @@ private void processInputDump() throws IOException
{

logger.log("Parsing input dumps...");
new XML2Binary(decompressor.getInputStream(getPagesArticlesFile()), files);
final List<String> parts = getPagesArticlesFiles();
final List<InputStream> streams = new ArrayList<>(parts.size());
for (String part : parts) {
streams.add(decompressor.getInputStream(part));
}
// A single-file dump reduces to a one-element list; the multi-part XML2Binary
// constructor handles both cases uniformly via MultiPartXmlDumpReader.
new XML2Binary(streams, files);

dumpVersionProcessor.setDumpVersions(new IDumpVersion[] { version });

@@ -111,30 +121,28 @@ private void processInputDump() throws IOException
}

/**
* Parses either {@code pages-articles.xml} or {@code pages-meta-current.xml}.
* If both files exist in the input directory {@code pages-meta-current.xml} will be favored.
* Selects the input articles dump, preferring {@code pages-meta-current} (which also
* contains discussion pages) and falling back to {@code pages-articles}. Returns every
* part of the selected role in ascending page-range order; a single-file dump yields a
* list of size 1.
*
* @return the input articles dump
* @return the ordered list of input articles dump parts
* @throws IOException If neither dump role is present.
*/
private String getPagesArticlesFile()
private List<String> getPagesArticlesFiles() throws IOException
{
String pagesArticlesFile = null;
String parseMessage = null;

// Use of minimal dump only with articles
if (files.getInputPagesArticles() != null) {
pagesArticlesFile = files.getInputPagesArticles();
parseMessage = "Discussions are unavailable";
final List<String> metaCurrent = files.getInputPagesMetaCurrentFiles();
if (!metaCurrent.isEmpty()) {
logger.log("Discussions are available");
return metaCurrent;
}

// Use of dump with discussions
if (files.getInputPagesMetaCurrent() != null) {
pagesArticlesFile = files.getInputPagesMetaCurrent();
parseMessage = "Discussions are available";
final List<String> articles = files.getInputPagesArticlesFiles();
if (!articles.isEmpty()) {
logger.log("Discussions are unavailable");
return articles;
}

logger.log(parseMessage);
return pagesArticlesFile;
throw new IOException("No pages-articles or pages-meta-current dump found in the input "
+ "directory.");
}

private PageParser createPageParser() throws IOException
@@ -19,10 +19,13 @@

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.dkpro.jwpl.datamachine.domain.DataMachineFiles;
import org.dkpro.jwpl.mwdumper.importer.DumpWriter;
import org.dkpro.jwpl.mwdumper.importer.NamespaceFilter;
import org.dkpro.jwpl.mwdumper.importer.XmlDumpReader;
import org.dkpro.jwpl.wikimachine.dump.xml.MultiPartXmlDumpReader;

/**
* Use org.mediawiki.importer engine to parse the XML-Dump (only useful fields) and store it to
@@ -51,16 +54,36 @@ public class XML2Binary
*/
public XML2Binary(InputStream iStream, DataMachineFiles files) throws IOException
{
final DumpWriter writer = new NamespaceFilter(new SimpleBinaryDumpWriter(files),
ENABLED_NAMESPACES);
if (USE_MODIFIED_PARSER) {
// modified parser, skips faulty tags
new SimpleXmlDumpReader(iStream,
new NamespaceFilter(new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)).readDump();
new SimpleXmlDumpReader(iStream, writer).readDump();
}
else {
// original MWDumper parser, very sensitive to unclosed tags
new XmlDumpReader(iStream,
new NamespaceFilter(new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)).readDump();
new XmlDumpReader(iStream, writer).readDump();
}
}

/**
* Instantiates an {@link XML2Binary} for a multi-part Wikipedia XML dump. Every stream in
* {@code iStreams} must be a self-contained XML document with its own {@code <mediawiki>}
* root; events across parts are collapsed into a single logical document by the underlying
* {@link MultiPartXmlDumpReader}.
*
* @param iStreams Ordered list of XML part streams (ascending page-range). Must not be
* {@code null} or empty; must not contain {@code null} elements.
* @param files The {@link DataMachineFiles} configuration to apply.
* @throws IOException Thrown if IO errors occurred during processing.
*/
public XML2Binary(List<InputStream> iStreams, DataMachineFiles files) throws IOException
{
final DumpWriter writer = new NamespaceFilter(new SimpleBinaryDumpWriter(files),
ENABLED_NAMESPACES);
// The modified parser is always used for multi-part dumps; the original XmlDumpReader
// is kept only for its stricter single-document parsing.
MultiPartXmlDumpReader.readDumps(iStreams, writer, SimpleXmlDumpReader::new);
}

}
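The MultiPartXmlDumpReader referenced above is not part of this hunk. As a conceptual illustration of the "collapse into a single logical document" idea, a delegating event sink could forward wiki-level open/close events only for the first and last part. The callback names below are stand-ins for illustration, not the real DumpWriter API.

// Conceptual sketch only: several self-contained dump parts are made to look
// like one document by suppressing duplicate wiki open/close events.
// PartEvents is a stand-in interface, not the real DumpWriter.
interface PartEvents {
    void startWiki();
    void endWiki();
    void page(String title);
}

final class CollapsingPartEvents implements PartEvents {
    private final PartEvents delegate;
    private final boolean firstPart;
    private final boolean lastPart;

    CollapsingPartEvents(PartEvents delegate, boolean firstPart, boolean lastPart) {
        this.delegate = delegate;
        this.firstPart = firstPart;
        this.lastPart = lastPart;
    }

    @Override public void startWiki() { if (firstPart) { delegate.startWiki(); } } // emit once
    @Override public void endWiki() { if (lastPart) { delegate.endWiki(); } }      // emit once
    @Override public void page(String title) { delegate.page(title); }             // always forward
}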