-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathadd_kit.py
More file actions
162 lines (129 loc) · 5.54 KB
/
add_kit.py
File metadata and controls
162 lines (129 loc) · 5.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import argparse
import os
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse, urlunparse
import json
def normalize_url(url: str) -> str:
    """
    Return ``url`` reduced to scheme, host and path, with a trailing slash.

    Params, query string and fragment are discarded.
    """
    # Only the first three components survive normalization
    scheme, netloc, path, *_ = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, '', '', ''))
    # Append the slash only when it is missing
    return stripped if stripped.endswith('/') else stripped + '/'
def scrape_addgene_kit(url: str, timeout: float = 30.0) -> str:
    """
    Fetch Addgene kit page content using a simple HTTP request.

    Args:
        url: Address of the kit page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout, requests can wait on an unresponsive
            server indefinitely.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for bad status codes
    return response.text
def sanitize_string(text: str) -> str:
    """Return ``text`` with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed
def _addgene_id_from_href(href: str) -> str:
    """Return the last path segment of ``href``, tolerating one trailing slash.

    An empty string is returned when neither of the last two segments
    contains anything.
    """
    segments = href.split("/")
    # str.split always yields at least one element, so this pop is safe.
    last = segments.pop()
    if last:
        return last
    # The href ended with "/": the ID is the segment before it.
    return segments.pop() if segments else ""


def extract_plasmids_from_page(
    page_content: str,
) -> tuple[str, list[tuple[str, str, str, str]]]:
    """
    Extract the kit title and plasmid information from Addgene kit page HTML.

    Replicates the JavaScript logic:
    - Finds the last table with class 'kit-inventory-table' in '#kit-contents'
    - Validates headers are 'Well', 'Plasmid', 'Resistance'
    - Extracts plasmid data from tbody rows

    Rows with fewer than three cells, without a link (with an href) in the
    second cell, or whose link yields no ID are skipped.

    Args:
        page_content: HTML of the kit page.

    Returns:
        A ``(kit_title, plasmids)`` pair, where every plasmid is a
        ``(well, name, addgene_id, resistance)`` tuple of strings.

    Raises:
        ValueError: If the title, the contents container, the inventory
            table, its three expected headers or its tbody are missing.
    """
    soup = bs(page_content, "html.parser")

    title_tag = soup.find("h1", id="kit-title")
    if not title_tag:
        raise ValueError("The html document does not have the expected structure: #kit-title not found")
    kit_title = title_tag.get_text(strip=True)

    # Find all tables with class 'kit-inventory-table' inside '#kit-contents'
    kit_contents = soup.find(id="kit-contents")
    if not kit_contents:
        raise ValueError("The html document does not have the expected structure: #kit-contents not found")

    tables = kit_contents.find_all("table", class_="kit-inventory-table")
    if not tables:
        raise ValueError("The html document does not have the expected structure: kit-inventory-table not found")

    # Get the last table (sometimes there is a table with only headers on top)
    kit_table = tables[-1]

    # Check that the headers of the table are Well, Plasmid and Resistance
    headers = kit_table.select("thead th")
    if len(headers) != 3:
        raise ValueError("The html document does not have the expected structure: expected 3 headers")
    header_text = [header.get_text(strip=True) for header in headers]
    if header_text != ["Well", "Plasmid", "Resistance"]:
        raise ValueError("The table does not have the expected headers")

    tbody = kit_table.find("tbody")
    if not tbody:
        raise ValueError("The table does not have a tbody")

    # Get the plasmid info
    plasmids = []
    for row in tbody.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            continue  # Skip incomplete rows

        # The addgene_id comes from the link in the second cell
        link = cells[1].find("a")
        href = link.get("href") if link else None
        if not href:
            continue  # Skip rows without links

        addgene_id = _addgene_id_from_href(href)
        if not addgene_id:
            continue  # Skip if we can't extract ID

        plasmids.append(
            (
                sanitize_string(cells[0].get_text()),
                sanitize_string(cells[1].get_text()),
                addgene_id,
                sanitize_string(cells[2].get_text()),
            )
        )
    return kit_title, plasmids
def get_kit_dirname(url: str) -> str:
    """Derive a flat directory name from the path of ``url``."""
    kit_path = urlparse(url).path
    prefix = "/kits/"
    if kit_path.startswith(prefix):
        kit_path = kit_path[len(prefix):]
    # Drop surrounding slashes, then join what remains with underscores
    segments = kit_path.strip("/").split("/")
    return "_".join(segments)
# Script entry point: download one kit page and write kits/<kit_name>/plasmids.tsv
# and kits/<kit_name>/info.json relative to the current working directory.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape Addgene kit page and extract plasmid information")
    parser.add_argument("url", help="URL of the Addgene kit page")
    args = parser.parse_args()

    if not args.url.startswith("https://www.addgene.org/"):
        raise ValueError("The URL must start with https://www.addgene.org/")

    normalized_url = normalize_url(args.url)
    kit_name = get_kit_dirname(normalized_url)
    page_content = scrape_addgene_kit(normalized_url)
    kit_title, plasmids = extract_plasmids_from_page(page_content)

    kit_dir = f"kits/{kit_name}"
    # exist_ok avoids the separate existence check before creating the directory
    os.makedirs(kit_dir, exist_ok=True)

    # Explicit UTF-8 so that non-ASCII plasmid names do not depend on the
    # platform's default encoding.
    with open(f"{kit_dir}/plasmids.tsv", "w", encoding="utf-8") as f:
        f.write("\t".join(["well", "name", "addgene_id", "resistance"]) + "\n")
        f.writelines("\t".join(plasmid) + "\n" for plasmid in plasmids)

    with open(f"{kit_dir}/info.json", "w", encoding="utf-8") as f:
        json.dump(
            {
                "title": kit_title,
                "url": normalized_url,
            },
            f,
            indent=4,
        )