Skip to content

Commit c21db75

Browse files
authored
Fixed DOCTYPE-disallowed regression in NCBI taxonomy parsing (#623)
- Switched to the new XmlBeansUtil.DOCUMENT_BUILDER_FACTORY_ALLOWING_DOCTYPE - Added a unit test
1 parent 3d13f02 commit c21db75

File tree

2 files changed

+129
-35
lines changed

2 files changed

+129
-35
lines changed

panoramapublic/src/org/labkey/panoramapublic/PanoramaPublicModule.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
import org.labkey.panoramapublic.pipeline.PxValidationPipelineProvider;
5555
import org.labkey.panoramapublic.proteomexchange.ExperimentModificationGetter;
5656
import org.labkey.panoramapublic.proteomexchange.Formula;
57+
import org.labkey.panoramapublic.proteomexchange.NcbiUtils;
5758
import org.labkey.panoramapublic.proteomexchange.SkylineVersion;
5859
import org.labkey.panoramapublic.proteomexchange.UnimodUtil;
5960
import org.labkey.panoramapublic.proteomexchange.validator.SkylineDocValidator;
@@ -384,6 +385,7 @@ public Set<String> getSchemaNames()
384385
set.add(BlueskyApiClient.TestCase.class);
385386
set.add(PrivateDataReminderSettings.TestCase.class);
386387
set.add(NcbiPublicationSearchServiceImpl.TestCase.class);
388+
set.add(NcbiUtils.TestCase.class);
387389

388390
return set;
389391
}

panoramapublic/src/org/labkey/panoramapublic/proteomexchange/NcbiUtils.java

Lines changed: 127 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
import org.apache.commons.lang3.StringUtils;
1919
import org.json.JSONObject;
20+
import org.junit.Assert;
21+
import org.junit.Test;
2022
import org.labkey.api.collections.IntHashMap;
2123
import org.labkey.api.util.PageFlowUtil;
2224
import org.labkey.api.util.XmlBeansUtil;
@@ -32,7 +34,9 @@
3234
import javax.xml.parsers.DocumentBuilder;
3335
import javax.xml.parsers.ParserConfigurationException;
3436
import java.io.BufferedReader;
37+
import java.io.ByteArrayInputStream;
3538
import java.io.IOException;
39+
import java.io.InputStream;
3640
import java.io.InputStreamReader;
3741
import java.net.HttpURLConnection;
3842
import java.net.URL;
@@ -117,8 +121,6 @@ public static Map<Integer, String> getScientificNames(List<Integer> taxIds) thro
117121
{
118122
String queryUrl = eutilsUrl + "&id=" + StringUtils.join(taxIds, ",");
119123

120-
Map<Integer, String> sciNameMap = new IntHashMap<>();
121-
122124
HttpURLConnection conn = null;
123125
try
124126
{
@@ -130,50 +132,140 @@ public static Map<Integer, String> getScientificNames(List<Integer> taxIds) thro
130132

131133
if (status == HttpURLConnection.HTTP_OK)
132134
{
133-
DocumentBuilder builder = XmlBeansUtil.DOCUMENT_BUILDER_FACTORY.newDocumentBuilder();
134-
Document doc = builder.parse(conn.getInputStream());
135+
return parseScientificNames(conn.getInputStream());
136+
}
137+
return new IntHashMap<>();
138+
}
139+
catch (IOException | SAXException | ParserConfigurationException e)
140+
{
141+
throw new PxException("Error doing NCBI lookup for scientific names.", e);
142+
}
143+
finally
144+
{
145+
if(conn != null) conn.disconnect();
146+
}
147+
}
148+
149+
/**
150+
* Returns the {@link DocumentBuilder} used to parse NCBI eSummary responses. NCBI's response
151+
* begins with a {@code <!DOCTYPE eSummaryResult PUBLIC ... esummary-v1.dtd>} declaration, so
152+
* we use the {@code _ALLOWING_DOCTYPE} variant which permits the DOCTYPE but keeps every
153+
* other XXE-mitigation in place.
154+
*/
155+
static DocumentBuilder getDocumentBuilder() throws ParserConfigurationException
156+
{
157+
return XmlBeansUtil.DOCUMENT_BUILDER_FACTORY_ALLOWING_DOCTYPE.newDocumentBuilder();
158+
}
135159

136-
NodeList nodes = doc.getElementsByTagName("DocSum");
137-
for(int i = 0; i < nodes.getLength(); i++)
160+
// Parses an NCBI eSummary taxonomy XML response and returns a map of taxid -> scientific name.
161+
private static Map<Integer, String> parseScientificNames(InputStream in)
162+
throws ParserConfigurationException, SAXException, IOException
163+
{
164+
Document doc = getDocumentBuilder().parse(in);
165+
166+
Map<Integer, String> sciNameMap = new IntHashMap<>();
167+
NodeList nodes = doc.getElementsByTagName("DocSum");
168+
for(int i = 0; i < nodes.getLength(); i++)
169+
{
170+
Element node = (Element)nodes.item(i);
171+
Node idNode = node.getElementsByTagName("Id").item(0);
172+
String taxidStr = null;
173+
if(idNode != null)
174+
{
175+
Node taxidNode = idNode.getFirstChild();
176+
if (taxidNode instanceof CharacterData)
138177
{
139-
Element node = (Element)nodes.item(i);
140-
Node idNode = node.getElementsByTagName("Id").item(0);
141-
String taxidStr = null;
142-
if(idNode != null)
143-
{
144-
Node taxidNode = idNode.getFirstChild();
145-
if (taxidNode instanceof CharacterData)
146-
{
147-
taxidStr = ((CharacterData) taxidNode).getData();
148-
}
149-
}
178+
taxidStr = ((CharacterData) taxidNode).getData();
179+
}
180+
}
150181

151-
NodeList children = node.getElementsByTagName("Item");
152-
for(int j = 0; j < children.getLength(); j++)
182+
NodeList children = node.getElementsByTagName("Item");
183+
for(int j = 0; j < children.getLength(); j++)
184+
{
185+
Element child = (Element)children.item(j);
186+
if(!StringUtils.isBlank(taxidStr) && ("ScientificName").equalsIgnoreCase(child.getAttribute("Name")))
187+
{
188+
Node sciName = child.getFirstChild();
189+
if(sciName instanceof CharacterData)
153190
{
154-
Element child = (Element)children.item(j);
155-
if(!StringUtils.isBlank(taxidStr) && ("ScientificName").equalsIgnoreCase(child.getAttribute("Name")))
156-
{
157-
Node sciName = child.getFirstChild();
158-
if(sciName instanceof CharacterData)
159-
{
160-
Integer taxid = Integer.parseInt(taxidStr);
161-
sciNameMap.put(taxid, ((CharacterData) sciName).getData());
162-
break;
163-
}
164-
}
191+
Integer taxid = Integer.parseInt(taxidStr);
192+
sciNameMap.put(taxid, ((CharacterData) sciName).getData());
193+
break;
165194
}
166195
}
167196
}
168197
}
169-
catch (IOException | SAXException | ParserConfigurationException e)
198+
return sciNameMap;
199+
}
200+
201+
public static class TestCase extends Assert
202+
{
203+
// NCBI esummary taxonomy response captured from
204+
// https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id=9606,10090,4932
205+
private static final String ESUMMARY_TAXONOMY_RESPONSE =
206+
"<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n" +
207+
"<!DOCTYPE eSummaryResult PUBLIC \"-//NLM//DTD esummary v1 20041029//EN\" \"https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20041029/esummary-v1.dtd\">\n" +
208+
"<eSummaryResult>\n" +
209+
"<DocSum>\n" +
210+
" <Id>9606</Id>\n" +
211+
" <Item Name=\"Status\" Type=\"String\">active</Item>\n" +
212+
" <Item Name=\"Rank\" Type=\"String\">species</Item>\n" +
213+
" <Item Name=\"Division\" Type=\"String\">primates</Item>\n" +
214+
" <Item Name=\"ScientificName\" Type=\"String\">Homo sapiens</Item>\n" +
215+
" <Item Name=\"CommonName\" Type=\"String\">human</Item>\n" +
216+
" <Item Name=\"TaxId\" Type=\"Integer\">9606</Item>\n" +
217+
" <Item Name=\"AkaTaxId\" Type=\"Integer\">0</Item>\n" +
218+
" <Item Name=\"Genus\" Type=\"String\"></Item>\n" +
219+
" <Item Name=\"Species\" Type=\"String\"></Item>\n" +
220+
" <Item Name=\"Subsp\" Type=\"String\"></Item>\n" +
221+
" <Item Name=\"ModificationDate\" Type=\"Date\">2024/09/10 00:00</Item>\n" +
222+
"</DocSum>\n" +
223+
"\n" +
224+
"<DocSum>\n" +
225+
" <Id>10090</Id>\n" +
226+
" <Item Name=\"Status\" Type=\"String\">active</Item>\n" +
227+
" <Item Name=\"Rank\" Type=\"String\">species</Item>\n" +
228+
" <Item Name=\"Division\" Type=\"String\">rodents</Item>\n" +
229+
" <Item Name=\"ScientificName\" Type=\"String\">Mus musculus</Item>\n" +
230+
" <Item Name=\"CommonName\" Type=\"String\">house mouse</Item>\n" +
231+
" <Item Name=\"TaxId\" Type=\"Integer\">10090</Item>\n" +
232+
" <Item Name=\"AkaTaxId\" Type=\"Integer\">0</Item>\n" +
233+
" <Item Name=\"Genus\" Type=\"String\"></Item>\n" +
234+
" <Item Name=\"Species\" Type=\"String\"></Item>\n" +
235+
" <Item Name=\"Subsp\" Type=\"String\"></Item>\n" +
236+
" <Item Name=\"ModificationDate\" Type=\"Date\">2025/06/16 00:00</Item>\n" +
237+
"</DocSum>\n" +
238+
"\n" +
239+
"<DocSum>\n" +
240+
" <Id>4932</Id>\n" +
241+
" <Item Name=\"Status\" Type=\"String\">active</Item>\n" +
242+
" <Item Name=\"Rank\" Type=\"String\">species</Item>\n" +
243+
" <Item Name=\"Division\" Type=\"String\">budding yeasts &amp; allies</Item>\n" +
244+
" <Item Name=\"ScientificName\" Type=\"String\">Saccharomyces cerevisiae</Item>\n" +
245+
" <Item Name=\"CommonName\" Type=\"String\">brewer's yeast</Item>\n" +
246+
" <Item Name=\"TaxId\" Type=\"Integer\">4932</Item>\n" +
247+
" <Item Name=\"AkaTaxId\" Type=\"Integer\">0</Item>\n" +
248+
" <Item Name=\"Genus\" Type=\"String\"></Item>\n" +
249+
" <Item Name=\"Species\" Type=\"String\"></Item>\n" +
250+
" <Item Name=\"Subsp\" Type=\"String\"></Item>\n" +
251+
" <Item Name=\"ModificationDate\" Type=\"Date\">2025/08/11 00:00</Item>\n" +
252+
"</DocSum>\n" +
253+
"\n" +
254+
"</eSummaryResult>\n";
255+
256+
@Test
257+
public void testParseScientificNames() throws Exception
170258
{
171-
throw new PxException("Error doing NCBI lookup for scientific names.", e);
259+
Map<Integer, String> names = parseScientificNames(toStream(ESUMMARY_TAXONOMY_RESPONSE));
260+
assertEquals(3, names.size());
261+
assertEquals("Homo sapiens", names.get(9606));
262+
assertEquals("Mus musculus", names.get(10090));
263+
assertEquals("Saccharomyces cerevisiae", names.get(4932));
172264
}
173-
finally
265+
266+
private static InputStream toStream(String xml)
174267
{
175-
if(conn != null) conn.disconnect();
268+
return new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8));
176269
}
177-
return sciNameMap;
178270
}
179271
}

0 commit comments

Comments
 (0)