diff --git a/Makefile b/Makefile index 9fc5a78..8a0f670 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,12 @@ duck_ccf_local_files: build @echo "warning! only works on Common Crawl Foundation's development machine" mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="ccf_local_files" +duck_local_files: build +ifndef LOCAL_DIR + $(error LOCAL_DIR is required. Usage: make duck_local_files LOCAL_DIR=/path/to/data) +endif + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="local_files $(LOCAL_DIR)" + duck_cloudfront: build @echo "warning! this might take 1-10 minutes" mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront" diff --git a/README.md b/README.md index a1d1ab3..7744c74 100644 --- a/README.md +++ b/README.md @@ -791,11 +791,50 @@ The program then writes that one record into a local Parquet file, does a second ### Bonus: download a full crawl index and query with DuckDB -If you want to run many of these queries, and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly. Run +If you want to run many of these queries, and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly. +All of these scripts run the same SQL query and should return the same record (written as a parquet file).
+ +```shell +mkdir -p 'crawl=CC-MAIN-2024-22/subset=warc' +aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ 'crawl=CC-MAIN-2024-22/subset=warc' +``` + +> [!IMPORTANT] +> If you happen to be using the Common Crawl Foundation development server, we've already downloaded these files, and you can run ```make duck_ccf_local_files``` + +If, by any other chance, you don't have access through the AWS CLI: + +```shell +mkdir -p 'crawl=CC-MAIN-2024-22/subset=warc' +cd 'crawl=CC-MAIN-2024-22/subset=warc' + +wget https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/cc-index-table.paths.gz +gunzip cc-index-table.paths.gz + +grep 'subset=warc' cc-index-table.paths | \ + awk '{print "https://data.commoncrawl.org/" $1, $1}' | \ + xargs -n 2 -P 10 sh -c ' + echo "Downloading: $2" + mkdir -p "$(dirname "$2")" && + wget -O "$2" "$1" + ' _ + +rm cc-index-table.paths +cd - +``` +The structure should be something like this: ```shell -aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ .' +tree my_data +my_data +└── crawl=CC-MAIN-2024-22 + └── subset=warc + ├── part-00000-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet + ├── part-00001-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet + ├── part-00002-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet ``` +Then, you can run `make duck_local_files LOCAL_DIR=/path/to/the/downloaded/data` to run the same query as above, but this time using your local copy of the index files. + > [!IMPORTANT] > If you happen to be using the Common Crawl Foundation development server, we've already downloaded these files, and you can run ```make duck_ccf_local_files``` @@ -821,7 +860,7 @@ We make more datasets available than just the ones discussed in this Whirlwind T Common Crawl regularly releases Web Graphs which are graphs describing the structure and connectivity of the web as captured in the crawl releases. 
We provide two levels of graph: host-level and domain-level. Both are available to download [from our website](https://commoncrawl.org/web-graphs). -The host-level graph describes links between pages on the web at the level of hostnames (e.g. `en.wikipedia.org`). The domain-level graph aggregates this information in the host-level graph, describing links at the pay-level domain (PLD) level (based on the public suffix list maintained on [publicsuffix.org](publicsuffix.org)). The PLD is the subdomain directly under the top-level domain (TLD): e.g. for `en.wikipedia.org`, the TLD would be `.org` and the PLD would be `wikipedia.org`. +The host-level graph describes links between pages on the web at the level of hostnames (e.g. `en.wikipedia.org`). The domain-level graph aggregates this information in the host-level graph, describing links at the pay-level domain (PLD) level (based on the public suffix list maintained on [publicsuffix.org](https://publicsuffix.org)). The PLD is the subdomain directly under the top-level domain (TLD): e.g. for `en.wikipedia.org`, the TLD would be `.org` and the PLD would be `wikipedia.org`. As an example, let's look at the [Web Graph release for March, April and May 2025](https://data.commoncrawl.org/projects/hyperlinkgraph/cc-main-2025-mar-apr-may/index.html). This page provides links to download data associated with the host- and domain-level graph for those months. The key files needed to construct the graphs are the files containing the vertices or nodes (the hosts or domains), and the files containing the edges (the links between the hosts/domains). These are currently the top two links in each of the tables. 
diff --git a/src/main/java/org/commoncrawl/whirlwind/Duck.java b/src/main/java/org/commoncrawl/whirlwind/Duck.java index 82350ab..ca87ec8 100644 --- a/src/main/java/org/commoncrawl/whirlwind/Duck.java +++ b/src/main/java/org/commoncrawl/whirlwind/Duck.java @@ -39,7 +39,7 @@ public class Duck { private static final DateTimeFormatter TIMESTAMP_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMddHHmmss"); public enum Algorithm { - CCF_LOCAL_FILES("ccf_local_files"), CLOUDFRONT("cloudfront"); + CCF_LOCAL_FILES("ccf_local_files"), CLOUDFRONT("cloudfront"), LOCAL_FILES("local_files"); private final String name; @@ -114,8 +114,13 @@ public static void printRowAsKvList(ResultSet rs, PrintStream out) throws SQLExc /** * Gets the list of parquet files to query based on the algorithm. */ - public static List getFiles(Algorithm algo, String crawl) throws IOException { + public static List getFiles(Algorithm algo, String crawl, String localPrefix) throws IOException { switch (algo) { + case LOCAL_FILES: { + Path indexPath = Path.of(localPrefix); + return getLocalParquetFiles(indexPath); + } + + case CCF_LOCAL_FILES: { Path indexPath = Path.of("/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc", "crawl=" + crawl, "subset=warc"); @@ -143,6 +148,23 @@ public static List getFiles(Algorithm algo, String crawl) throws IOExcep } } + private static List getLocalParquetFiles(Path indexPath) throws IOException { + if (!Files.isDirectory(indexPath)) { + System.err.println("Directory not found: " + indexPath); + System.exit(1); + } + + List files; + try (java.util.stream.Stream<Path> stream = Files.walk(indexPath)) { files = stream.map(Path::toString).filter(string -> string.endsWith(".parquet")).collect(Collectors.toList()); } + + if (files.isEmpty()) { + System.err.println("No parquet files found in: " + indexPath); + System.exit(1); + } + + return files; + } + private static List getLocalParquetFiles(Path indexPath, String prefix, String crawl) throws IOException { if (!Files.isDirectory(indexPath)) { printIndexDownloadAdvice(prefix, crawl); @@
-190,6 +212,7 @@ private static ResultSet executeWithRetry(Statement stmt, String sql) throws SQL public static void main(String[] args) { String crawl = "CC-MAIN-2024-22"; Algorithm algo = Algorithm.CLOUDFRONT; + String localPrefix = "/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc"; if (args.length > 0) { if ("help".equalsIgnoreCase(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0])) { @@ -201,8 +224,17 @@ public static void main(String[] args) { System.out.println("Using algorithm: " + algo.getName()); } + if (algo == Algorithm.LOCAL_FILES) { + if (args.length < 2) { + System.err.println("Error: local_files algorithm requires a directory argument."); + printUsage(); + System.exit(1); + } + localPrefix = args[1]; + } + try { - run(algo, crawl); + run(algo, crawl, localPrefix); } catch (Exception e) { System.err.println("Error: " + e.getMessage()); printUsage(); @@ -210,11 +242,12 @@ public static void main(String[] args) { } } - public static void run(Algorithm algo, String crawl) throws IOException, SQLException, InterruptedException { + public static void run(Algorithm algo, String crawl, String localPrefix) + throws IOException, SQLException, InterruptedException { // Ensure stdout uses UTF-8 PrintStream out = new PrintStream(System.out, true, StandardCharsets.UTF_8); - List files = getFiles(algo, crawl); + List files = getFiles(algo, crawl, localPrefix); String filesList = files.stream().map(f -> "'" + f + "'").collect(Collectors.joining(", ")); // Use in-memory DuckDB @@ -230,15 +263,16 @@ public static void run(Algorithm algo, String crawl) throws IOException, SQLExce // Count total records out.printf("Total records for crawl: %s%n", crawl); - try (ResultSet rs = executeWithRetry(stmt, "SELECT COUNT(*) as cnt FROM ccindex")) { + try (ResultSet rs = executeWithRetry(stmt, + "SELECT COUNT(*) as cnt FROM ccindex " + "WHERE subset = 'warc' AND crawl = '" + crawl + "'")) { if (rs.next()) { out.println(rs.getLong("cnt")); } } // Query for our 
specific row - String selectQuery = "" + "SELECT * FROM ccindex WHERE subset = 'warc' " + "AND crawl = 'CC-MAIN-2024-22' " - + "AND url_host_tld = 'org' " + "AND url_host_registered_domain = 'wikipedia.org' " + String selectQuery = "SELECT * FROM ccindex WHERE subset = 'warc' AND crawl = '" + crawl + "' " + + "AND url_host_tld = 'org' AND url_host_registered_domain = 'wikipedia.org' " + "AND url = 'https://an.wikipedia.org/wiki/Escopete'"; out.println("Our one row:"); @@ -305,14 +339,19 @@ private static void printResultSet(ResultSet rs, PrintStream out) throws SQLExce } private static void printUsage() { - System.err.println("Usage: Duck [algorithm]"); + System.err.println("Usage: Duck [algorithm] [local-directory]"); System.err.println(); System.err.println("Query Common Crawl index using DuckDB."); System.err.println(); System.err.println("Algorithms:"); - System.err.println(" ccf_local_files Use local parquet files from /home/cc-pds/commoncrawl/..."); + System.err.println(" local_files Use local parquet files (from specified local directory)"); + System.err.println( + " ccf_local_files Use local parquet files (default: /home/cc-pds/commoncrawl/cc-index/table/cc-main/warc)"); System.err.println(" cloudfront Use CloudFront URLs (requires .warc.paths.gz file)"); System.err.println(); + System.err.println("Arguments:"); + System.err.println(" local-directory Local directory prefix for 'local_files' algorithm"); + System.err.println(); System.err.println("Options:"); System.err.println(" help, --help, -h Show this help message"); }