From 307f6b2f0f2121d529af48679d22e14af2839aad Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 2 Feb 2026 17:08:31 +0100 Subject: [PATCH 1/2] fix: editorconfig to avoid replacing end_of_line for warc, wet and warc --- .editorconfig | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.editorconfig b/.editorconfig index 6aa3960..d9944f3 100644 --- a/.editorconfig +++ b/.editorconfig @@ -3,17 +3,22 @@ root = true [*] -end_of_line = lf insert_final_newline = true +[{*.warc, *.warc.wet, *.warc.wat}] +insert_final_newline = false + [*.java] +end_of_line = lf charset = utf-8 indent_style = space indent_size = 4 [Makefile] +end_of_line = lf indent_style = tab [*.yaml] +end_of_line = lf indent_style = space indent_size = 2 From 8791a4859b473d053033f42ed371437c55d8900a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 5 Feb 2026 16:25:22 +0100 Subject: [PATCH 2/2] fix: revert line ending to follow the WARC format Signed-off-by: Luca Foppiano --- .gitattributes | 3 + data/whirlwind.warc | 200 ++++++++++++++++++++-------------------- data/whirlwind.warc.wat | 56 +++++------ data/whirlwind.warc.wet | 62 ++++++------- 4 files changed, 162 insertions(+), 159 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..fe56037 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +*.warc binary +*.warc.wet binary +*.warc.wat binary diff --git a/data/whirlwind.warc b/data/whirlwind.warc index bdf0eab..edd798f 100644 --- a/data/whirlwind.warc +++ b/data/whirlwind.warc @@ -1,85 +1,85 @@ -WARC/1.0 -WARC-Type: warcinfo -WARC-Date: 2024-05-17T23:31:22Z -WARC-Record-ID: -Content-Length: 486 -Content-Type: application/warc-fields -WARC-Filename: CC-MAIN-20240517233122-20240518023122-00000.warc.gz - -isPartOf: CC-MAIN-2024-22 -publisher: Common Crawl -description: Wide crawl of the web for May 2024 -operator: Common Crawl Admin (info@commoncrawl.org) -hostname: ip-10-67-67-211 -software: Apache Nutch 1.19 (modified, https://github.com/commoncrawl/nutch/) -robots: checked via crawler-commons 1.5-SNAPSHOT (https://github.com/crawler-commons/crawler-commons) -format: WARC File Format 1.1 -conformsTo: https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/ - - -WARC/1.0 -WARC-Type: request -WARC-Date: 2024-05-18T01:58:10Z -WARC-Record-ID: -Content-Length: 265 -Content-Type: application/http; msgtype=request -WARC-Warcinfo-ID: -WARC-IP-Address: 208.80.154.224 -WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete - -GET /wiki/Escopete HTTP/1.1 -User-Agent: CCBot/2.0 (https://commoncrawl.org/faq/) -Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 -Accept-Language: en-US,en;q=0.5 -Accept-Encoding: br,gzip -Host: an.wikipedia.org -Connection: Keep-Alive - - - -WARC/1.0 -WARC-Type: response -WARC-Date: 2024-05-18T01:58:10Z -WARC-Record-ID: -Content-Length: 74581 -Content-Type: application/http; msgtype=response -WARC-Warcinfo-ID: -WARC-Concurrent-To: -WARC-IP-Address: 208.80.154.224 -WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete -WARC-Payload-Digest: sha1:RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU -WARC-Block-Digest: sha1:35FTUGFVNWRVTZQGCWIX2MQA3LMYC7X7 -WARC-Identified-Payload-Type: text/html - -HTTP/1.1 200 OK -date: Sat, 18 May 2024 01:58:10 GMT -server: mw-web.eqiad.canary-bb67b76b8-jtwdb -x-content-type-options: nosniff -content-language: an -origin-trial: AonOP4SwCrqpb0nhZbg554z9iJimP3DxUDB8V4yu9fyyepauGKD0NXqTknWi4gnuDfMG6hNb7TDUDTsl0mDw9gIAAABmeyJvcmlnaW4iOiJodHRwczovL3dpa2lwZWRpYS5vcmc6NDQzIiwiZmVhdHVyZSI6IlRvcExldmVsVHBjZCIsImV4cGlyeSI6MTczNTM0Mzk5OSwiaXNTdWJkb21haW4iOnRydWV9 -accept-ch: -vary: Accept-Encoding,Cookie,Authorization -last-modified: Sat, 04 May 2024 01:58:10 GMT -content-type: text/html; charset=UTF-8 -X-Crawler-content-encoding: gzip -age: 0 -x-cache: cp1106 miss, cp1106 miss -x-cache-status: miss -server-timing: cache;desc="miss", host;desc="cp1106" -strict-transport-security: max-age=106384710; includeSubDomains; preload -report-to: { "group": "wm_nel", "max_age": 604800, "endpoints": [{ "url": "https://intake-logging.wikimedia.org/v1/events?stream=w3c.reportingapi.network_error&schema_uri=/w3c/reportingapi/network_error/1.0.0" }] } -nel: { "report_to": "wm_nel", "max_age": 604800, "failure_fraction": 0.05, "success_fraction": 0.0} -set-cookie: WMF-Last-Access=18-May-2024;Path=/;HttpOnly;secure;Expires=Wed, 19 Jun 2024 00:00:00 GMT -set-cookie: WMF-Last-Access-Global=18-May-2024;Path=/;Domain=.wikipedia.org;HttpOnly;secure;Expires=Wed, 19 Jun 2024 00:00:00 GMT -set-cookie: WMF-DP=1a6;Path=/;HttpOnly;secure;Expires=Sat, 18 May 2024 00:00:00 GMT -x-client-ip: 34.239.158.223 -cache-control: private, s-maxage=0, max-age=0, must-revalidate -set-cookie: GeoIP=US:VA:Ashburn:39.05:-77.49:v4; Path=/; secure; Domain=.wikipedia.org -set-cookie: NetworkProbeLimit=0.001;Path=/;Secure;Max-Age=3600 -accept-ranges: bytes -X-Crawler-transfer-encoding: chunked -Content-Length: 72848 - +WARC/1.0 +WARC-Type: warcinfo +WARC-Date: 2024-05-17T23:31:22Z +WARC-Record-ID: +Content-Length: 486 +Content-Type: application/warc-fields +WARC-Filename: CC-MAIN-20240517233122-20240518023122-00000.warc.gz + +isPartOf: CC-MAIN-2024-22 +publisher: Common Crawl +description: Wide crawl of the web for May 2024 +operator: Common Crawl Admin (info@commoncrawl.org) +hostname: ip-10-67-67-211 +software: Apache Nutch 1.19 (modified, https://github.com/commoncrawl/nutch/) +robots: checked via crawler-commons 1.5-SNAPSHOT (https://github.com/crawler-commons/crawler-commons) +format: WARC File Format 1.1 +conformsTo: https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/ + + +WARC/1.0 +WARC-Type: request +WARC-Date: 2024-05-18T01:58:10Z +WARC-Record-ID: +Content-Length: 265 +Content-Type: application/http; msgtype=request +WARC-Warcinfo-ID: +WARC-IP-Address: 208.80.154.224 +WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete + +GET /wiki/Escopete HTTP/1.1 +User-Agent: CCBot/2.0 (https://commoncrawl.org/faq/) +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 +Accept-Language: en-US,en;q=0.5 +Accept-Encoding: br,gzip +Host: an.wikipedia.org +Connection: Keep-Alive + + + +WARC/1.0 +WARC-Type: response +WARC-Date: 2024-05-18T01:58:10Z +WARC-Record-ID: +Content-Length: 74581 +Content-Type: application/http; msgtype=response +WARC-Warcinfo-ID: +WARC-Concurrent-To: +WARC-IP-Address: 208.80.154.224 +WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete +WARC-Payload-Digest: sha1:RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU +WARC-Block-Digest: sha1:35FTUGFVNWRVTZQGCWIX2MQA3LMYC7X7 +WARC-Identified-Payload-Type: text/html + +HTTP/1.1 200 OK +date: Sat, 18 May 2024 01:58:10 GMT +server: mw-web.eqiad.canary-bb67b76b8-jtwdb +x-content-type-options: nosniff +content-language: an +origin-trial: AonOP4SwCrqpb0nhZbg554z9iJimP3DxUDB8V4yu9fyyepauGKD0NXqTknWi4gnuDfMG6hNb7TDUDTsl0mDw9gIAAABmeyJvcmlnaW4iOiJodHRwczovL3dpa2lwZWRpYS5vcmc6NDQzIiwiZmVhdHVyZSI6IlRvcExldmVsVHBjZCIsImV4cGlyeSI6MTczNTM0Mzk5OSwiaXNTdWJkb21haW4iOnRydWV9 +accept-ch: +vary: Accept-Encoding,Cookie,Authorization +last-modified: Sat, 04 May 2024 01:58:10 GMT +content-type: text/html; charset=UTF-8 +X-Crawler-content-encoding: gzip +age: 0 +x-cache: cp1106 miss, cp1106 miss +x-cache-status: miss +server-timing: cache;desc="miss", host;desc="cp1106" +strict-transport-security: max-age=106384710; includeSubDomains; preload +report-to: { "group": "wm_nel", "max_age": 604800, "endpoints": [{ "url": "https://intake-logging.wikimedia.org/v1/events?stream=w3c.reportingapi.network_error&schema_uri=/w3c/reportingapi/network_error/1.0.0" }] } +nel: { "report_to": "wm_nel", "max_age": 604800, "failure_fraction": 0.05, "success_fraction": 0.0} +set-cookie: WMF-Last-Access=18-May-2024;Path=/;HttpOnly;secure;Expires=Wed, 19 Jun 2024 00:00:00 GMT +set-cookie: WMF-Last-Access-Global=18-May-2024;Path=/;Domain=.wikipedia.org;HttpOnly;secure;Expires=Wed, 19 Jun 2024 00:00:00 GMT +set-cookie: WMF-DP=1a6;Path=/;HttpOnly;secure;Expires=Sat, 18 May 2024 00:00:00 GMT +x-client-ip: 34.239.158.223 +cache-control: private, s-maxage=0, max-age=0, must-revalidate +set-cookie: GeoIP=US:VA:Ashburn:39.05:-77.49:v4; Path=/; secure; Domain=.wikipedia.org +set-cookie: NetworkProbeLimit=0.001;Path=/;Secure;Max-Age=3600 +accept-ranges: bytes +X-Crawler-transfer-encoding: chunked +Content-Length: 72848 + @@ -932,21 +932,21 @@ Mire-se - - -WARC/1.0 -WARC-Type: metadata -WARC-Date: 2024-05-18T01:58:10Z -WARC-Record-ID: -Content-Length: 201 -Content-Type: application/warc-fields -WARC-Warcinfo-ID: -WARC-Concurrent-To: -WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete - -fetchTimeMs: 258 -charset-detected: UTF-8 -languages-cld2: {"reliable":false,"text-bytes":3080,"languages":[{"code":"es","code-iso-639-3":"spa","text-covered":0.69,"score":335.0,"name":"SPANISH"}]} - - - + + +WARC/1.0 +WARC-Type: metadata +WARC-Date: 2024-05-18T01:58:10Z +WARC-Record-ID: +Content-Length: 201 +Content-Type: application/warc-fields +WARC-Warcinfo-ID: +WARC-Concurrent-To: +WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete + +fetchTimeMs: 258 +charset-detected: UTF-8 +languages-cld2: {"reliable":false,"text-bytes":3080,"languages":[{"code":"es","code-iso-639-3":"spa","text-covered":0.69,"score":335.0,"name":"SPANISH"}]} + + + diff --git a/data/whirlwind.warc.wat b/data/whirlwind.warc.wat index f136ee6..f134ff4 100644 --- a/data/whirlwind.warc.wat +++ b/data/whirlwind.warc.wat @@ -1,28 +1,28 @@ -WARC/1.0 -WARC-Type: warcinfo -WARC-Date: 2024-05-31T01:16:45Z -WARC-Filename: CC-MAIN-20240517233122-20240518023122-00000.warc.wat.gz -WARC-Record-ID: -Content-Type: application/warc-fields -Content-Length: 278 - -Software-Info: ia-web-commons.1.1.10-SNAPSHOT-20240513074037 -Extracted-Date: Fri, 31 May 2024 01:16:45 GMT -ip: 10.67.67.159 -hostname: ip-10-67-67-159.ec2.internal -format: WARC File Format 1.0 -conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf - - - -WARC/1.0 -WARC-Type: metadata -WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete -WARC-Date: 2024-05-31T01:17:49Z -WARC-Record-ID: -WARC-Refers-To: -Content-Type: application/json -Content-Length: 1386 - -{"Container":{"Filename":"CC-MAIN-20240517233122-20240518023122-00000.warc.gz","Compressed":true,"Offset":"80610308","Gzip-Metadata":{"Deflate-Length":"423","Header-Length":"10","Footer-Length":"8","Inflated-CRC":"1106529533","Inflated-Length":"626"}},"Envelope":{"Payload-Metadata":{"Actual-Content-Type":"application/http; msgtype=request","HTTP-Request-Metadata":{"Request-Message":{"Method":"GET","Path":"/wiki/Escopete","Version":"HTTP/1.1"},"Headers-Length":"263","Headers":{"User-Agent":"CCBot/2.0 (https://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language":"en-US,en;q=0.5","Accept-Encoding":"br,gzip","Host":"an.wikipedia.org","Connection":"Keep-Alive"},"Entity-Length":"0","Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ","Entity-Trailing-Slop-Length":"0"},"Actual-Content-Length":"265","Trailing-Slop-Length":"4","Block-Digest":"sha1:IE7NEN3QEJHUCYRRGVMHDDW3BEHFRQ6V"},"Format":"WARC/1.0","WARC-Header-Length":"357","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2024-05-18T01:58:10Z","WARC-Record-ID":"","Content-Length":"265","Content-Type":"application/http; msgtype=request","WARC-Warcinfo-ID":"","WARC-IP-Address":"208.80.154.224","WARC-Target-URI":"https://an.wikipedia.org/wiki/Escopete"}}} - +WARC/1.0 +WARC-Type: warcinfo +WARC-Date: 2024-05-31T01:16:45Z +WARC-Filename: CC-MAIN-20240517233122-20240518023122-00000.warc.wat.gz +WARC-Record-ID: +Content-Type: application/warc-fields +Content-Length: 278 + +Software-Info: ia-web-commons.1.1.10-SNAPSHOT-20240513074037 +Extracted-Date: Fri, 31 May 2024 01:16:45 GMT +ip: 10.67.67.159 +hostname: ip-10-67-67-159.ec2.internal +format: WARC File Format 1.0 +conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf + + + +WARC/1.0 +WARC-Type: metadata +WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete +WARC-Date: 2024-05-31T01:17:49Z +WARC-Record-ID: +WARC-Refers-To: +Content-Type: application/json +Content-Length: 1386 + +{"Container":{"Filename":"CC-MAIN-20240517233122-20240518023122-00000.warc.gz","Compressed":true,"Offset":"80610308","Gzip-Metadata":{"Deflate-Length":"423","Header-Length":"10","Footer-Length":"8","Inflated-CRC":"1106529533","Inflated-Length":"626"}},"Envelope":{"Payload-Metadata":{"Actual-Content-Type":"application/http; msgtype=request","HTTP-Request-Metadata":{"Request-Message":{"Method":"GET","Path":"/wiki/Escopete","Version":"HTTP/1.1"},"Headers-Length":"263","Headers":{"User-Agent":"CCBot/2.0 (https://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language":"en-US,en;q=0.5","Accept-Encoding":"br,gzip","Host":"an.wikipedia.org","Connection":"Keep-Alive"},"Entity-Length":"0","Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ","Entity-Trailing-Slop-Length":"0"},"Actual-Content-Length":"265","Trailing-Slop-Length":"4","Block-Digest":"sha1:IE7NEN3QEJHUCYRRGVMHDDW3BEHFRQ6V"},"Format":"WARC/1.0","WARC-Header-Length":"357","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2024-05-18T01:58:10Z","WARC-Record-ID":"","Content-Length":"265","Content-Type":"application/http; msgtype=request","WARC-Warcinfo-ID":"","WARC-IP-Address":"208.80.154.224","WARC-Target-URI":"https://an.wikipedia.org/wiki/Escopete"}}} + diff --git a/data/whirlwind.warc.wet b/data/whirlwind.warc.wet index b8bfd45..828e68d 100644 --- a/data/whirlwind.warc.wet +++ b/data/whirlwind.warc.wet @@ -1,32 +1,32 @@ -WARC/1.0 -WARC-Type: warcinfo -WARC-Date: 2024-05-31T01:16:46Z -WARC-Filename: CC-MAIN-20240517233122-20240518023122-00000.warc.wet.gz -WARC-Record-ID: -Content-Type: application/warc-fields -Content-Length: 368 - -Software-Info: ia-web-commons.1.1.10-SNAPSHOT-20240513074037 -Extracted-Date: Fri, 31 May 2024 01:16:46 GMT -robots: checked via crawler-commons 1.5-SNAPSHOT (https://github.com/crawler-commons/crawler-commons) -isPartOf: CC-MAIN-2024-22 -operator: Common Crawl Admin (info@commoncrawl.org) -description: Wide crawl of the web for May 2024 -publisher: Common Crawl - - - -WARC/1.0 -WARC-Type: conversion -WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete -WARC-Date: 2024-05-18T01:58:10Z -WARC-Record-ID: -WARC-Refers-To: -WARC-Block-Digest: sha1:RDTSR52RUHWDA7QK4BK7OUHU3EXTXYUL -WARC-Identified-Content-Language: spa -Content-Type: text/plain -Content-Length: 4456 - +WARC/1.0 +WARC-Type: warcinfo +WARC-Date: 2024-05-31T01:16:46Z +WARC-Filename: CC-MAIN-20240517233122-20240518023122-00000.warc.wet.gz +WARC-Record-ID: +Content-Type: application/warc-fields +Content-Length: 368 + +Software-Info: ia-web-commons.1.1.10-SNAPSHOT-20240513074037 +Extracted-Date: Fri, 31 May 2024 01:16:46 GMT +robots: checked via crawler-commons 1.5-SNAPSHOT (https://github.com/crawler-commons/crawler-commons) +isPartOf: CC-MAIN-2024-22 +operator: Common Crawl Admin (info@commoncrawl.org) +description: Wide crawl of the web for May 2024 +publisher: Common Crawl + + + +WARC/1.0 +WARC-Type: conversion +WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete +WARC-Date: 2024-05-18T01:58:10Z +WARC-Record-ID: +WARC-Refers-To: +WARC-Block-Digest: sha1:RDTSR52RUHWDA7QK4BK7OUHU3EXTXYUL +WARC-Identified-Content-Language: spa +Content-Type: text/plain +Content-Length: 4456 + Escopete - Biquipedia, a enciclopedia libre Ir al contenido Menú principal @@ -209,5 +209,5 @@ Estatisticas Declaración de cookies Versión ta mobils Activar o desactivar el límite de anchura del contenido - - + +