1717
1818import org .apache .commons .lang3 .StringUtils ;
1919import org .json .JSONObject ;
20+ import org .junit .Assert ;
21+ import org .junit .Test ;
2022import org .labkey .api .collections .IntHashMap ;
2123import org .labkey .api .util .PageFlowUtil ;
2224import org .labkey .api .util .XmlBeansUtil ;
3234import javax .xml .parsers .DocumentBuilder ;
3335import javax .xml .parsers .ParserConfigurationException ;
3436import java .io .BufferedReader ;
37+ import java .io .ByteArrayInputStream ;
3538import java .io .IOException ;
39+ import java .io .InputStream ;
3640import java .io .InputStreamReader ;
3741import java .net .HttpURLConnection ;
3842import java .net .URL ;
@@ -117,8 +121,6 @@ public static Map<Integer, String> getScientificNames(List<Integer> taxIds) thro
117121 {
118122 String queryUrl = eutilsUrl + "&id=" + StringUtils .join (taxIds , "," );
119123
120- Map <Integer , String > sciNameMap = new IntHashMap <>();
121-
122124 HttpURLConnection conn = null ;
123125 try
124126 {
@@ -130,50 +132,140 @@ public static Map<Integer, String> getScientificNames(List<Integer> taxIds) thro
130132
131133 if (status == HttpURLConnection .HTTP_OK )
132134 {
133- DocumentBuilder builder = XmlBeansUtil .DOCUMENT_BUILDER_FACTORY .newDocumentBuilder ();
134- Document doc = builder .parse (conn .getInputStream ());
135+ return parseScientificNames (conn .getInputStream ());
136+ }
137+ return new IntHashMap <>();
138+ }
139+ catch (IOException | SAXException | ParserConfigurationException e )
140+ {
141+ throw new PxException ("Error doing NCBI lookup for scientific names." , e );
142+ }
143+ finally
144+ {
145+ if (conn != null ) conn .disconnect ();
146+ }
147+ }
148+
149+ /**
150+ * Returns the {@link DocumentBuilder} used to parse NCBI eSummary responses. NCBI's response
151+ * begins with a {@code <!DOCTYPE eSummaryResult PUBLIC ... esummary-v1.dtd>} declaration, so
152+ * we use the {@code _ALLOWING_DOCTYPE} variant which permits the DOCTYPE but keeps every
153+ * other XXE-mitigation in place.
154+ */
155+ static DocumentBuilder getDocumentBuilder () throws ParserConfigurationException
156+ {
157+ return XmlBeansUtil .DOCUMENT_BUILDER_FACTORY_ALLOWING_DOCTYPE .newDocumentBuilder ();
158+ }
135159
136- NodeList nodes = doc .getElementsByTagName ("DocSum" );
137- for (int i = 0 ; i < nodes .getLength (); i ++)
160+ // Parses an NCBI eSummary taxonomy XML response and returns a map of taxid -> scientific name.
161+ private static Map <Integer , String > parseScientificNames (InputStream in )
162+ throws ParserConfigurationException , SAXException , IOException
163+ {
164+ Document doc = getDocumentBuilder ().parse (in );
165+
166+ Map <Integer , String > sciNameMap = new IntHashMap <>();
167+ NodeList nodes = doc .getElementsByTagName ("DocSum" );
168+ for (int i = 0 ; i < nodes .getLength (); i ++)
169+ {
170+ Element node = (Element )nodes .item (i );
171+ Node idNode = node .getElementsByTagName ("Id" ).item (0 );
172+ String taxidStr = null ;
173+ if (idNode != null )
174+ {
175+ Node taxidNode = idNode .getFirstChild ();
176+ if (taxidNode instanceof CharacterData )
138177 {
139- Element node = (Element )nodes .item (i );
140- Node idNode = node .getElementsByTagName ("Id" ).item (0 );
141- String taxidStr = null ;
142- if (idNode != null )
143- {
144- Node taxidNode = idNode .getFirstChild ();
145- if (taxidNode instanceof CharacterData )
146- {
147- taxidStr = ((CharacterData ) taxidNode ).getData ();
148- }
149- }
178+ taxidStr = ((CharacterData ) taxidNode ).getData ();
179+ }
180+ }
150181
151- NodeList children = node .getElementsByTagName ("Item" );
152- for (int j = 0 ; j < children .getLength (); j ++)
182+ NodeList children = node .getElementsByTagName ("Item" );
183+ for (int j = 0 ; j < children .getLength (); j ++)
184+ {
185+ Element child = (Element )children .item (j );
186+ if (!StringUtils .isBlank (taxidStr ) && ("ScientificName" ).equalsIgnoreCase (child .getAttribute ("Name" )))
187+ {
188+ Node sciName = child .getFirstChild ();
189+ if (sciName instanceof CharacterData )
153190 {
154- Element child = (Element )children .item (j );
155- if (!StringUtils .isBlank (taxidStr ) && ("ScientificName" ).equalsIgnoreCase (child .getAttribute ("Name" )))
156- {
157- Node sciName = child .getFirstChild ();
158- if (sciName instanceof CharacterData )
159- {
160- Integer taxid = Integer .parseInt (taxidStr );
161- sciNameMap .put (taxid , ((CharacterData ) sciName ).getData ());
162- break ;
163- }
164- }
191+ Integer taxid = Integer .parseInt (taxidStr );
192+ sciNameMap .put (taxid , ((CharacterData ) sciName ).getData ());
193+ break ;
165194 }
166195 }
167196 }
168197 }
169- catch (IOException | SAXException | ParserConfigurationException e )
198+ return sciNameMap ;
199+ }
200+
201+ public static class TestCase extends Assert
202+ {
203+ // NCBI esummary taxonomy response captured from
204+ // https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id=9606,10090,4932
205+ private static final String ESUMMARY_TAXONOMY_RESPONSE =
206+ "<?xml version=\" 1.0\" encoding=\" UTF-8\" ?>\n " +
207+ "<!DOCTYPE eSummaryResult PUBLIC \" -//NLM//DTD esummary v1 20041029//EN\" \" https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20041029/esummary-v1.dtd\" >\n " +
208+ "<eSummaryResult>\n " +
209+ "<DocSum>\n " +
210+ " <Id>9606</Id>\n " +
211+ " <Item Name=\" Status\" Type=\" String\" >active</Item>\n " +
212+ " <Item Name=\" Rank\" Type=\" String\" >species</Item>\n " +
213+ " <Item Name=\" Division\" Type=\" String\" >primates</Item>\n " +
214+ " <Item Name=\" ScientificName\" Type=\" String\" >Homo sapiens</Item>\n " +
215+ " <Item Name=\" CommonName\" Type=\" String\" >human</Item>\n " +
216+ " <Item Name=\" TaxId\" Type=\" Integer\" >9606</Item>\n " +
217+ " <Item Name=\" AkaTaxId\" Type=\" Integer\" >0</Item>\n " +
218+ " <Item Name=\" Genus\" Type=\" String\" ></Item>\n " +
219+ " <Item Name=\" Species\" Type=\" String\" ></Item>\n " +
220+ " <Item Name=\" Subsp\" Type=\" String\" ></Item>\n " +
221+ " <Item Name=\" ModificationDate\" Type=\" Date\" >2024/09/10 00:00</Item>\n " +
222+ "</DocSum>\n " +
223+ "\n " +
224+ "<DocSum>\n " +
225+ " <Id>10090</Id>\n " +
226+ " <Item Name=\" Status\" Type=\" String\" >active</Item>\n " +
227+ " <Item Name=\" Rank\" Type=\" String\" >species</Item>\n " +
228+ " <Item Name=\" Division\" Type=\" String\" >rodents</Item>\n " +
229+ " <Item Name=\" ScientificName\" Type=\" String\" >Mus musculus</Item>\n " +
230+ " <Item Name=\" CommonName\" Type=\" String\" >house mouse</Item>\n " +
231+ " <Item Name=\" TaxId\" Type=\" Integer\" >10090</Item>\n " +
232+ " <Item Name=\" AkaTaxId\" Type=\" Integer\" >0</Item>\n " +
233+ " <Item Name=\" Genus\" Type=\" String\" ></Item>\n " +
234+ " <Item Name=\" Species\" Type=\" String\" ></Item>\n " +
235+ " <Item Name=\" Subsp\" Type=\" String\" ></Item>\n " +
236+ " <Item Name=\" ModificationDate\" Type=\" Date\" >2025/06/16 00:00</Item>\n " +
237+ "</DocSum>\n " +
238+ "\n " +
239+ "<DocSum>\n " +
240+ " <Id>4932</Id>\n " +
241+ " <Item Name=\" Status\" Type=\" String\" >active</Item>\n " +
242+ " <Item Name=\" Rank\" Type=\" String\" >species</Item>\n " +
243+ " <Item Name=\" Division\" Type=\" String\" >budding yeasts & allies</Item>\n " +
244+ " <Item Name=\" ScientificName\" Type=\" String\" >Saccharomyces cerevisiae</Item>\n " +
245+ " <Item Name=\" CommonName\" Type=\" String\" >brewer's yeast</Item>\n " +
246+ " <Item Name=\" TaxId\" Type=\" Integer\" >4932</Item>\n " +
247+ " <Item Name=\" AkaTaxId\" Type=\" Integer\" >0</Item>\n " +
248+ " <Item Name=\" Genus\" Type=\" String\" ></Item>\n " +
249+ " <Item Name=\" Species\" Type=\" String\" ></Item>\n " +
250+ " <Item Name=\" Subsp\" Type=\" String\" ></Item>\n " +
251+ " <Item Name=\" ModificationDate\" Type=\" Date\" >2025/08/11 00:00</Item>\n " +
252+ "</DocSum>\n " +
253+ "\n " +
254+ "</eSummaryResult>\n " ;
255+
256+ @ Test
257+ public void testParseScientificNames () throws Exception
170258 {
171- throw new PxException ("Error doing NCBI lookup for scientific names." , e );
259+ Map <Integer , String > names = parseScientificNames (toStream (ESUMMARY_TAXONOMY_RESPONSE ));
260+ assertEquals (3 , names .size ());
261+ assertEquals ("Homo sapiens" , names .get (9606 ));
262+ assertEquals ("Mus musculus" , names .get (10090 ));
263+ assertEquals ("Saccharomyces cerevisiae" , names .get (4932 ));
172264 }
173- finally
265+
266+ private static InputStream toStream (String xml )
174267 {
175- if ( conn != null ) conn . disconnect ( );
268+ return new ByteArrayInputStream ( xml . getBytes ( StandardCharsets . UTF_8 ) );
176269 }
177- return sciNameMap ;
178270 }
179271}
0 commit comments