diff --git a/Makefile b/Makefile index 8a0f670..142c8e3 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ duck_cloudfront: build jwarc.jar: @echo "downloading JWarc JAR" - curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar + curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.35.0/jwarc-0.35.0.jar wreck_the_warc: build jwarc.jar @echo diff --git a/README.md b/README.md index fac7601..50256fc 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ Now that we've looked at the uncompressed versions of these files to understand The [JWarc](https://github.com/iipc/jwarc) Java library lets us read and write WARC files both programmatically and via a CLI. -You should download the [JWarc](https://github.com/iipc/jwarc)'s JAR using `make get_jwarc` which should download the JAR in the root directory. +You should download the [JWarc](https://github.com/iipc/jwarc)'s JAR using `make jwarc.jar` which should download the JAR in the root directory. If you download it yourself, we recommend you to rename it to remove the version from the jar filename, so you can copy-paste the commands directly. You can now explore the CLI commands available by running: @@ -434,7 +434,7 @@ We can create our own CDXJ index from the local WARCs by running: ```make cdxj``` -This uses the JWARC library and, partially, a home-cooked code that we wrote to support WET and WAT records, to generate CDXJ index files for our WARC files by running the code below: +This uses the JWARC library to generate CDXJ index files for our WARC files by running the code below:
Click to view code @@ -442,8 +442,8 @@ This uses the JWARC library and, partially, a home-cooked code that we wrote to ``` creating *.cdxj index files from the local warcs java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj -mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj -mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj +java -jar jwarc.jar cdxj data/whirlwind.warc.wet.gz --record-type conversion > whirlwind.warc.wet.cdxj +java -jar jwarc.jar cdxj data/whirlwind.warc.wat.gz --record-type metadata > whirlwind.warc.wat.cdxj ```
diff --git a/src/main/java/org/commoncrawl/whirlwind/CdxWriterWithDynamicFiltering.java b/src/main/java/org/commoncrawl/whirlwind/CdxWriterWithDynamicFiltering.java deleted file mode 100644 index 75f7c5a..0000000 --- a/src/main/java/org/commoncrawl/whirlwind/CdxWriterWithDynamicFiltering.java +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.commoncrawl.whirlwind; - -import org.netpreserve.jwarc.*; -import org.netpreserve.jwarc.cdx.CdxFormat; -import org.netpreserve.jwarc.cdx.CdxRequestEncoder; -import org.netpreserve.jwarc.cdx.CdxWriter; - -import java.io.IOException; -import java.io.Writer; -import java.net.URI; -import java.time.ZoneOffset; -import java.time.format.DateTimeFormatter; -import java.util.function.Consumer; -import java.util.function.Predicate; - -public class CdxWriterWithDynamicFiltering extends CdxWriter { - private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyyMMddHHmmss") - .withZone(ZoneOffset.UTC); - - private final Writer writer; - private CdxFormat format = CdxFormat.CDXJ; - private boolean postAppend = false; - private Consumer warningHandler; - private Predicate recordFilter = null; - - public CdxWriterWithDynamicFiltering(Writer writer) { - super(writer); - this.writer = writer; - } - - @Override - public void setFormat(CdxFormat format) { - super.setFormat(format); - this.format = format; - } - - public CdxFormat getFormat() { - return this.format; - } - - @Override - public void setPostAppend(boolean postAppend) { - super.setPostAppend(postAppend); - this.postAppend = postAppend; - } - - @Override - public void onWarning(Consumer warningHandler) { - super.onWarning(warningHandler); - this.warningHandler = warningHandler; - } - - @Override - public void process(WarcReader reader, String filename) throws IOException { - - if (recordFilter == null) { - super.process(reader, filename); - return; - } - - // Custom processing for filtered record types, since we are filtering, we get - // and process - // every record here. - WarcRecord record = reader.next().orElse(null); - while (record != null) { - try { - String recordType = record.type().toLowerCase(); - - long position = reader.position(); - - // Handle WarcCaptureRecord types (response, resource, revisit, request) - if (record instanceof WarcCaptureRecord) { - WarcCaptureRecord capture = (WarcCaptureRecord) record; - URI id = record.version().getProtocol().equals("ARC") ? null : record.id(); - - // Ensure HTTP header is parsed for revisit records - if (record instanceof WarcRevisit && record.contentType().base().equals(MediaType.HTTP)) { - ((WarcRevisit) record).http(); - } - - // Advance to next record to calculate length - record = reader.next().orElse(null); - long length = reader.position() - position; - - // Skip records without a date - if (!capture.headers().first("WARC-Date").isPresent()) { - emitWarning(filename, position, "Skipping record due to missing or invalid date"); - continue; - } - - String encodedRequest = null; - if (postAppend) { - while (encodedRequest == null && record instanceof WarcCaptureRecord - && ((WarcCaptureRecord) record).concurrentTo().contains(id)) { - if (record instanceof WarcRequest) { - HttpRequest httpRequest = ((WarcRequest) record).http(); - encodedRequest = CdxRequestEncoder.encode(httpRequest); - } - record = reader.next().orElse(null); - } - } - - write(capture, filename, position, length, encodedRequest); - } - // Handle WarcConversion (from WET files) and other WarcTargetRecord types - else if (record instanceof WarcTargetRecord) { - WarcTargetRecord targetRecord = (WarcTargetRecord) record; - - // Advance to next record to calculate length - record = reader.next().orElse(null); - long length = reader.position() - position; - - // Skip records without a date - if (!targetRecord.headers().first("WARC-Date").isPresent()) { - emitWarning(filename, position, "Skipping record due to missing or invalid date"); - continue; - } - - writeTargetRecord(targetRecord, filename, position, length); - } else { - // Skip non-target records (like warcinfo) - record = reader.next().orElse(null); - } - } catch (ParsingException e) { - emitWarning(filename, reader.position(), "ParsingException: " + e.getBaseMessage()); - record = reader.next().orElse(null); - } - } - } - - @Override - public void setRecordFilter(Predicate recordFilter) { - super.setRecordFilter(recordFilter); - this.recordFilter = recordFilter; - } - - /** - * Writes a CDXJ record for a WarcTargetRecord (like WarcConversion from WET - * files). - * - * TODO: make it more generic and integrated into jwarc - */ - private void writeTargetRecord(WarcTargetRecord record, String filename, long position, long length) - throws IOException { - String target = record.target(); - if (target == null) { - emitWarning(filename, position, "Skipping record due to missing target URI"); - return; - } - - // Build CDXJ line: surt timestamp {json} - StringBuilder line = new StringBuilder(); - - // SURT-formatted URL key - String surt = URIs.toNormalizedSurt(target); - line.append(escape(surt)); - line.append(' '); - - // Timestamp - String timestamp = DATE_FORMAT.format(record.date()); - line.append(timestamp); - line.append(' '); - - // JSON block - line.append('{'); - - // URL - line.append("\"url\": \""); - escapeJsonString(line, target); - line.append("\""); - - // MIME type - try { - if (record.payload().isPresent()) { - MediaType mime = record.payload().get().type(); - if (mime != null) { - line.append(", \"mime\": \""); - escapeJsonString(line, mime.base().toString()); - line.append("\""); - } - } - } catch (IOException e) { - // Skip mime if payload can't be read - } - - // Digest - record.payloadDigest().ifPresent(digest -> { - line.append(", \"digest\": \""); - escapeJsonString(line, digest.raw()); - line.append("\""); - }); - - // Filename - if (filename != null) { - line.append(", \"filename\": \""); - escapeJsonString(line, filename); - line.append("\""); - } - - // Offset - line.append(", \"offset\": \""); - line.append(position); - line.append("\""); - - // Length - line.append(", \"length\": \""); - line.append(length); - line.append("\""); - - line.append('}'); - - writer.write(line.toString()); - writer.write('\n'); - } - - private void emitWarning(String filename, long position, String message) { - if (warningHandler == null) - return; - warningHandler.accept(filename + " (offset " + position + ") " + message); - } - - // Borrowed from org.netpreserve.jwarc.cdx.CdxWriter - // TODO: remove duplication - private static String escape(String str) { - if (str == null) - return null; - return str.replace(" ", "%20").replace("\n", "%0A").replace("\0", "%00"); - } - - // Borrowed from org.netpreserve.jwarc.cdx.CdxWriter - // TODO: remove duplication - private static void escapeJsonString(StringBuilder out, String value) { - for (int i = 0; i < value.length(); i++) { - char c = value.charAt(i); - if (c == '"') - out.append("\\\""); - else if (c == '\\') - out.append("\\\\"); - else if (c == '\b') - out.append("\\b"); - else if (c == '\f') - out.append("\\f"); - else if (c == '\n') - out.append("\\n"); - else if (c == '\r') - out.append("\\r"); - else if (c == '\t') - out.append("\\t"); - else if (c <= 0x1f) { - out.append("\\u00"); - out.append(Character.forDigit((c & 0xf0) >>> 4, 16)); - out.append(Character.forDigit(c & 0xf, 16)); - } else { - out.append(c); - } - } - } -} diff --git a/src/main/java/org/commoncrawl/whirlwind/CdxjIndexer.java b/src/main/java/org/commoncrawl/whirlwind/CdxjIndexer.java deleted file mode 100644 index b5c0d6e..0000000 --- a/src/main/java/org/commoncrawl/whirlwind/CdxjIndexer.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.commoncrawl.whirlwind; - -import org.apache.commons.lang3.StringUtils; -import org.netpreserve.jwarc.WarcReader; -import org.netpreserve.jwarc.cdx.CdxFormat; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStreamWriter; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; - -public class CdxjIndexer { - - public static void main(String[] args) throws IOException { - String inputFile = null; - Set recordTypes = null; - - for (int i = 0; i < args.length; i++) { - if (args[i].equals("--records") && i + 1 < args.length) { - // Support comma-separated record types: --records conversion,metadata - String typesArg = args[++i]; - recordTypes = new HashSet<>(Arrays.asList(typesArg.split(","))); - } else if (args[i].equals("--help") || args[i].equals("-h")) { - printUsage(); - System.exit(0); - } else if (!args[i].startsWith("-")) { - inputFile = args[i]; - } else { - System.err.println("Unknown option: " + args[i]); - printUsage(); - System.exit(1); - } - } - - if (inputFile == null) { - System.err.println("Error: Input file is required"); - printUsage(); - System.exit(1); - } - - Path requested = Path.of(inputFile).toAbsolutePath().normalize(); - if (!Files.isRegularFile(requested)) { - throw new SecurityException("Invalid WARC path: " + requested); - } - - // TODO: Move this into the WarcReader or the process of iterating over the - // records - if (requested.toString().endsWith("gz") || requested.toString().endsWith("gzip")) { - try { - ValidateWARC.validateRandomAccessWarcOrFail(requested); - } catch (IOException e) { - System.err.println("This file is probably not a multi-member gzip but a single gzip file.\n" - + "To allow seek, a gzipped WARC must have each record compressed into a single gzip member and concatenated together.\n\n" - + "This file is likely still valid and can be fixed by running:\n" - + "mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args=\"" - + inputFile + " " + inputFile.replace(".gz", ".recompressed.gz") + "\""); - System.exit(-1); - } - } - - try (InputStream in = Files.newInputStream(requested); - CdxWriterWithDynamicFiltering cdxjWriter = new CdxWriterWithDynamicFiltering( - new OutputStreamWriter(System.out)); - WarcReader reader = new WarcReader(in)) { - reader.setLenient(true); - cdxjWriter.setFormat(CdxFormat.CDXJ); - if (recordTypes != null) { - Set unmodifiableRecordTypes = Collections.unmodifiableSet(recordTypes); - cdxjWriter.setRecordFilter( - record -> unmodifiableRecordTypes.contains(StringUtils.lowerCase(record.type()))); - } - cdxjWriter.process(reader, requested.toString()); - } - } - - private static void printUsage() { - System.err.println("Usage: cdxj-indexer [OPTIONS] "); - System.err.println(); - System.err.println("Options:"); - System.err.println(" --records Comma-separated list of record types to index"); - System.err.println(" (e.g., conversion, response, metadata)"); - System.err.println(" --help, -h Show this help message"); - System.err.println(); - System.err.println("Examples:"); - System.err.println(" cdxj-indexer file.warc.gz"); - System.err.println(" cdxj-indexer --records conversion file.wet.gz"); - System.err.println(" cdxj-indexer --records response,resource file.warc.gz"); - } -}