11package org .labkey .panoramapublic .ncbi ;
22
33import org .apache .commons .lang3 .StringUtils ;
4+ import org .apache .commons .text .StringEscapeUtils ;
45import org .apache .hc .client5 .http .classic .methods .HttpGet ;
56import org .apache .hc .client5 .http .config .ConnectionConfig ;
67import org .apache .hc .client5 .http .config .RequestConfig ;
@@ -303,12 +304,10 @@ private List<String> executeSearch(String query, String database, Logger log)
303304 {
304305 String encodedQuery = URLEncoder .encode (query , StandardCharsets .UTF_8 );
305306 String url = ESEARCH_URL +
306- "?db= " + database +
307+ "?" + buildCommonParams ( database ) +
307308 "&term=" + encodedQuery +
308309 "&retmax=" + NcbiPublicationSearchService .MAX_RESULTS +
309- "&retmode=json" +
310- "&tool=" + TOOL +
311- "&email=" + URLEncoder .encode (EMAIL , StandardCharsets .UTF_8 );
310+ "&retmode=json" ;
312311
313312 try
314313 {
@@ -400,11 +399,9 @@ private Map<String, JSONObject> fetchMetadata(Collection<String> ids, String dat
400399
401400 String idString = String .join ("," , ids );
402401 String url = ESUMMARY_URL +
403- "?db= " + database +
402+ "?" + buildCommonParams ( database ) +
404403 "&id=" + URLEncoder .encode (idString , StandardCharsets .UTF_8 ) +
405- "&retmode=json" +
406- "&tool=" + TOOL +
407- "&email=" + URLEncoder .encode (EMAIL , StandardCharsets .UTF_8 );
404+ "&retmode=json" ;
408405
409406 try
410407 {
@@ -582,8 +579,11 @@ private List<PublicationMatch> searchPubMed(ExperimentAnnotations expAnnotations
582579 }
583580
584581 // Search PubMed: "LastName FirstName[Author] AND Title NOT preprint[Publication Type]"
582+ // PubMed has no escape mechanism for special characters like brackets or parentheses —
583+ // the recommended approach is to remove them before searching.
584+ // See: https://pubmed.ncbi.nlm.nih.gov/help/
585585 String query = String .format ("%s %s[Author] AND %s NOT preprint[Publication Type]" ,
586- lastName , firstName , title );
586+ stripQuerySpecialChars ( lastName ), stripQuerySpecialChars ( firstName ), stripQuerySpecialChars ( title ) );
587587
588588 log .debug ("PubMed fallback query: {}" , query );
589589 List <String > pmids = searchPubMed (query , log );
@@ -781,18 +781,19 @@ private static boolean keywordsMatch(List<String> sourceKeywords, List<String> t
781781 }
782782
783783 /**
784- * Normalize title for comparison: strip HTML tags, strip diacritics, lowercase,
785- * replace punctuation with space (so "data-independent" becomes "data independent"),
786- * and normalize whitespace.
784+ * Normalize title for comparison: decode HTML entities (e.g. {@code &}), strip HTML tags
785+ * (e.g. {@code <i>}, {@code </i>}), strip diacritics, lowercase, replace punctuation with space
786+ * (so "data-independent" becomes "data independent"), and normalize whitespace.
787+ * The tag regex {@code </?[a-zA-Z]+>} matches letter-only tag names with no attributes e.g. {@code <i>}, {@code <sub>}).
787788 */
788789 static String normalizeTitle (String title )
789790 {
790791 if (title == null ) return "" ;
791792
792- return stripDiacritics (title .toLowerCase ())
793- .replaceAll ("<[^> ]+>" , " " ) // Strip HTML/XML tags
794- .replaceAll ("[^\\ w\\ s]" , " " ) // Replace punctuation with space (not remove)
795- .replaceAll ("\\ s+" , " " ) // Normalize whitespace
793+ return stripDiacritics (StringEscapeUtils . unescapeHtml4 ( title .toLowerCase ())
794+ .replaceAll ("</?[a-zA-Z ]+>" , " " )) // Strip HTML tags (e.g. <i>, </i>, <sub>)
795+ .replaceAll ("[^\\ w\\ s]" , " " ) // Replace punctuation with space
796+ .replaceAll ("\\ s+" , " " ) // Normalize whitespace
796797 .trim ();
797798 }
798799
@@ -916,11 +917,33 @@ private static void rateLimit()
916917 }
917918
918919 /**
919- * Wrap string in quotes for exact match search
920+ * Remove characters that have special meaning in PubMed query syntax (brackets, parentheses, quotes).
921+ * PubMed provides no escape mechanism for these characters. The recommended approach is to remove them.
922+ * See: https://pubmed.ncbi.nlm.nih.gov/help/
923+ */
924+ static String stripQuerySpecialChars (String value )
925+ {
926+ if (value == null ) return "" ;
927+ return value .replaceAll ("[\\ [\\ ]()\" ]" , "" ).replaceAll ("\\ s+" , " " ).trim ();
928+ }
929+
930+ /**
931+ * Build the common NCBI API parameters (db, tool, email) with URL encoding.
932+ */
933+ private static String buildCommonParams (String database )
934+ {
935+ return "db=" + URLEncoder .encode (database , StandardCharsets .UTF_8 ) +
936+ "&tool=" + URLEncoder .encode (TOOL , StandardCharsets .UTF_8 ) +
937+ "&email=" + URLEncoder .encode (EMAIL , StandardCharsets .UTF_8 );
938+ }
939+
940+ /**
941+ * Wrap string in quotes for exact phrase match search.
942+ * Any embedded double quotes are removed since NCBI provides no escape mechanism for them.
920943 */
921944 private static String quote (String str )
922945 {
923- return "\" " + str + "\" " ;
946+ return "\" " + str . replace ( " \" " , "" ) + "\" " ;
924947 }
925948
926949 public static class TestCase extends Assert
@@ -1197,6 +1220,14 @@ public void testNormalizeTitle()
11971220 assertEquals ("in vivo analysis of protein" , normalizeTitle ("<i>in vivo</i> analysis of protein" ));
11981221 assertEquals ("e coli proteomics" , normalizeTitle ("<i>E. coli</i> proteomics" ));
11991222
1223+ // HTML entities are decoded: & replaced by space (not left as "amp")
1224+ assertEquals ("proteomics genomics" , normalizeTitle ("Proteomics & Genomics" ));
1225+ assertEquals ("e coli analysis" , normalizeTitle ("<i>E. coli</i> analysis" ));
1226+
1227+ // Non-tag angle bracket expressions are NOT stripped by the tag regex; < and > are replaced by punctuation step.
1228+ assertEquals ("proteins 10 kda threshold" , normalizeTitle ("Proteins <10 kDa threshold" ));
1229+ assertEquals ("proteins with mw 10 kda and 5 kda" , normalizeTitle ("Proteins with MW <10 kDa and >5 kDa" ));
1230+
12001231 // Apostrophes are replaced with space (like other punctuation)
12011232 assertEquals ("garcia s analysis" , normalizeTitle ("García's Analysis" ));
12021233
@@ -1205,6 +1236,36 @@ public void testNormalizeTitle()
12051236 assertEquals ("data independent acquisition" , normalizeTitle ("data independent acquisition" ));
12061237 }
12071238
1239+ @ Test
1240+ public void testStripQuerySpecialChars ()
1241+ {
1242+ // Parentheses removed entirely (not replaced with space)
1243+ assertEquals ("proteins analysis" , stripQuerySpecialChars ("protein(s) analysis" ));
1244+ assertEquals ("novel approach" , stripQuerySpecialChars ("[novel] approach" ));
1245+
1246+ // Double quotes removed
1247+ assertEquals ("the omics revolution" , stripQuerySpecialChars ("the \" omics\" revolution" ));
1248+
1249+ // Null and empty
1250+ assertEquals ("" , stripQuerySpecialChars (null ));
1251+ assertEquals ("" , stripQuerySpecialChars ("" ));
1252+
1253+ // No special chars — unchanged
1254+ assertEquals ("Smith John" , stripQuerySpecialChars ("Smith John" ));
1255+
1256+ // Chars removed entirely — no space inserted between adjacent text
1257+ assertEquals ("ABC" , stripQuerySpecialChars ("A([B])C" ));
1258+ }
1259+
1260+ @ Test
1261+ public void testQuote ()
1262+ {
1263+ assertEquals ("\" PXD012345\" " , quote ("PXD012345" ));
1264+
1265+ // Inner double quotes are removed
1266+ assertEquals ("\" title with quotes\" " , quote ("title with \" quotes\" " ));
1267+ }
1268+
12081269 // -- extractPubMedId tests --
12091270
12101271 @ Test
0 commit comments