Skip to content

Commit f503667

Browse files
committed
Addressed code review feedback
- URL-encode db and tool parameters in NCBI API calls - Strip PubMed query syntax characters ([]()\") from author/title query. No escape mechanism exists - Decode HTML entities and tighten tag-stripping regex in normalizeTitle() - Strip inner quotes from quote() defensively - Add container check (ensureCorrectContainer) in NotifySubmitterOfPublicationAction - Propagate PubMed ID in UpdatePublicationDetailsAction only when form doesn't already have one (e.g. URL-supplied value from notification link) - Wrap notification post and DatasetStatus update in a transaction in NotifySubmitterOfPublicationAction - Update tests
1 parent 245def2 commit f503667

2 files changed

Lines changed: 113 additions & 40 deletions

File tree

panoramapublic/src/org/labkey/panoramapublic/PanoramaPublicController.java

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6951,6 +6951,13 @@ public ModelAndView getView(PublicationDetailsForm form, boolean reshow, BindExc
69516951
return new SimpleErrorView(errors);
69526952
}
69536953

6954+
// Only pre-populate the PubMed ID from the copied experiment if the form doesn't already
6955+
// have one. The form may arrive pre-populated via the URL when the user clicks the
6956+
// "Make Public" link in a notification message that includes a suggested publication.
6957+
if (!form.hasPubmedId())
6958+
{
6959+
form.setPubmedId(_copiedExperiment.getPubmedId());
6960+
}
69546961
form.setLink(_copiedExperiment.getPublicationLink());
69556962
form.setCitation(_copiedExperiment.getCitation());
69566963
return getPublicationDetailsView(form, errors);
@@ -10930,6 +10937,7 @@ public boolean handlePost(NotifySubmitterForm form, BindException errors) throws
1093010937
errors.reject(ERROR_MSG, "No experiment found for Id " + form.getId());
1093110938
return false;
1093210939
}
10940+
ensureCorrectContainer(getContainer(), exptAnnotations.getContainer(), getViewContext());
1093310941

1093410942
// Check if the user has already dismissed this publication suggestion
1093510943
DatasetStatus datasetStatus = DatasetStatusManager.getForExperiment(exptAnnotations);
@@ -10988,30 +10996,34 @@ public boolean handlePost(NotifySubmitterForm form, BindException errors) throws
1098810996
notifyUsers.add(exptAnnotations.getLabHeadUser());
1098910997
}
1099010998

10991-
PanoramaPublicNotification.postPrivateDataReminderMessage(
10992-
journal, submission, exptAnnotations, submitter, getUser(), notifyUsers,
10993-
_announcement, _announcementsContainer, getUser(), selectedMatch);
10994-
10995-
// Update DatasetStatus
10996-
if (datasetStatus == null)
10999+
try (DbScope.Transaction transaction = PanoramaPublicManager.getSchema().getScope().ensureTransaction())
1099711000
{
10998-
datasetStatus = new DatasetStatus();
10999-
datasetStatus.setExperimentAnnotationsId(exptAnnotations.getId());
11000-
}
11001-
datasetStatus.setPotentialPublicationId(form.getPublicationId());
11002-
datasetStatus.setPublicationType(pubType.name());
11003-
datasetStatus.setPublicationMatchInfo(form.getMatchInfo());
11004-
datasetStatus.setCitation(selectedMatch.getCitation());
11005-
datasetStatus.setUserDismissedPublication(null);
11006-
datasetStatus.setLastReminderDate(new Date());
11001+
PanoramaPublicNotification.postPrivateDataReminderMessage(
11002+
journal, submission, exptAnnotations, submitter, getUser(), notifyUsers,
11003+
_announcement, _announcementsContainer, getUser(), selectedMatch);
1100711004

11008-
if (datasetStatus.getId() == 0)
11009-
{
11010-
DatasetStatusManager.save(datasetStatus, getUser());
11011-
}
11012-
else
11013-
{
11014-
DatasetStatusManager.update(datasetStatus, getUser());
11005+
// Update DatasetStatus
11006+
if (datasetStatus == null)
11007+
{
11008+
datasetStatus = new DatasetStatus();
11009+
datasetStatus.setExperimentAnnotationsId(exptAnnotations.getId());
11010+
}
11011+
datasetStatus.setPotentialPublicationId(form.getPublicationId());
11012+
datasetStatus.setPublicationType(pubType.name());
11013+
datasetStatus.setPublicationMatchInfo(form.getMatchInfo());
11014+
datasetStatus.setCitation(selectedMatch.getCitation());
11015+
datasetStatus.setUserDismissedPublication(null);
11016+
datasetStatus.setLastReminderDate(new Date());
11017+
11018+
if (datasetStatus.getId() == 0)
11019+
{
11020+
DatasetStatusManager.save(datasetStatus, getUser());
11021+
}
11022+
else
11023+
{
11024+
DatasetStatusManager.update(datasetStatus, getUser());
11025+
}
11026+
transaction.commit();
1101511027
}
1101611028

1101711029
return true;

panoramapublic/src/org/labkey/panoramapublic/ncbi/NcbiPublicationSearchServiceImpl.java

Lines changed: 79 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package org.labkey.panoramapublic.ncbi;
22

33
import org.apache.commons.lang3.StringUtils;
4+
import org.apache.commons.text.StringEscapeUtils;
45
import org.apache.hc.client5.http.classic.methods.HttpGet;
56
import org.apache.hc.client5.http.config.ConnectionConfig;
67
import org.apache.hc.client5.http.config.RequestConfig;
@@ -303,12 +304,10 @@ private List<String> executeSearch(String query, String database, Logger log)
303304
{
304305
String encodedQuery = URLEncoder.encode(query, StandardCharsets.UTF_8);
305306
String url = ESEARCH_URL +
306-
"?db=" + database +
307+
"?" + buildCommonParams(database) +
307308
"&term=" + encodedQuery +
308309
"&retmax=" + NcbiPublicationSearchService.MAX_RESULTS +
309-
"&retmode=json" +
310-
"&tool=" + TOOL +
311-
"&email=" + URLEncoder.encode(EMAIL, StandardCharsets.UTF_8);
310+
"&retmode=json";
312311

313312
try
314313
{
@@ -400,11 +399,9 @@ private Map<String, JSONObject> fetchMetadata(Collection<String> ids, String dat
400399

401400
String idString = String.join(",", ids);
402401
String url = ESUMMARY_URL +
403-
"?db=" + database +
402+
"?" + buildCommonParams(database) +
404403
"&id=" + URLEncoder.encode(idString, StandardCharsets.UTF_8) +
405-
"&retmode=json" +
406-
"&tool=" + TOOL +
407-
"&email=" + URLEncoder.encode(EMAIL, StandardCharsets.UTF_8);
404+
"&retmode=json";
408405

409406
try
410407
{
@@ -582,8 +579,11 @@ private List<PublicationMatch> searchPubMed(ExperimentAnnotations expAnnotations
582579
}
583580

584581
// Search PubMed: "LastName FirstName[Author] AND Title NOT preprint[Publication Type]"
582+
// PubMed has no escape mechanism for special characters like brackets or parentheses —
583+
// the recommended approach is to remove them before searching.
584+
// See: https://pubmed.ncbi.nlm.nih.gov/help/
585585
String query = String.format("%s %s[Author] AND %s NOT preprint[Publication Type]",
586-
lastName, firstName, title);
586+
stripQuerySpecialChars(lastName), stripQuerySpecialChars(firstName), stripQuerySpecialChars(title));
587587

588588
log.debug("PubMed fallback query: {}", query);
589589
List<String> pmids = searchPubMed(query, log);
@@ -781,18 +781,19 @@ private static boolean keywordsMatch(List<String> sourceKeywords, List<String> t
781781
}
782782

783783
/**
784-
* Normalize title for comparison: strip HTML tags, strip diacritics, lowercase,
785-
* replace punctuation with space (so "data-independent" becomes "data independent"),
786-
* and normalize whitespace.
784+
* Normalize title for comparison: decode HTML entities (e.g. {@code &amp;}), strip HTML tags
785+
* (e.g. {@code <i>}, {@code </i>}), strip diacritics, lowercase, replace punctuation with space
786+
* (so "data-independent" becomes "data independent"), and normalize whitespace.
787+
* The tag regex {@code </?[a-zA-Z]+>} matches letter-only tag names with no attributes e.g. {@code <i>}, {@code <sub>}).
787788
*/
788789
static String normalizeTitle(String title)
789790
{
790791
if (title == null) return "";
791792

792-
return stripDiacritics(title.toLowerCase())
793-
.replaceAll("<[^>]+>", " ") // Strip HTML/XML tags
794-
.replaceAll("[^\\w\\s]", " ") // Replace punctuation with space (not remove)
795-
.replaceAll("\\s+", " ") // Normalize whitespace
793+
return stripDiacritics(StringEscapeUtils.unescapeHtml4(title.toLowerCase())
794+
.replaceAll("</?[a-zA-Z]+>", " ")) // Strip HTML tags (e.g. <i>, </i>, <sub>)
795+
.replaceAll("[^\\w\\s]", " ") // Replace punctuation with space
796+
.replaceAll("\\s+", " ") // Normalize whitespace
796797
.trim();
797798
}
798799

@@ -916,11 +917,33 @@ private static void rateLimit()
916917
}
917918

918919
/**
919-
* Wrap string in quotes for exact match search
920+
* Remove characters that have special meaning in PubMed query syntax (brackets, parentheses, quotes).
921+
* PubMed provides no escape mechanism for these characters. The recommended approach is to remove them.
922+
* See: https://pubmed.ncbi.nlm.nih.gov/help/
923+
*/
924+
static String stripQuerySpecialChars(String value)
925+
{
926+
if (value == null) return "";
927+
return value.replaceAll("[\\[\\]()\"]", "").replaceAll("\\s+", " ").trim();
928+
}
929+
930+
/**
931+
* Build the common NCBI API parameters (db, tool, email) with URL encoding.
932+
*/
933+
private static String buildCommonParams(String database)
934+
{
935+
return "db=" + URLEncoder.encode(database, StandardCharsets.UTF_8) +
936+
"&tool=" + URLEncoder.encode(TOOL, StandardCharsets.UTF_8) +
937+
"&email=" + URLEncoder.encode(EMAIL, StandardCharsets.UTF_8);
938+
}
939+
940+
/**
941+
* Wrap string in quotes for exact phrase match search.
942+
* Any embedded double quotes are removed since NCBI provides no escape mechanism for them.
920943
*/
921944
private static String quote(String str)
922945
{
923-
return "\"" + str + "\"";
946+
return "\"" + str.replace("\"", "") + "\"";
924947
}
925948

926949
public static class TestCase extends Assert
@@ -1197,6 +1220,14 @@ public void testNormalizeTitle()
11971220
assertEquals("in vivo analysis of protein", normalizeTitle("<i>in vivo</i> analysis of protein"));
11981221
assertEquals("e coli proteomics", normalizeTitle("<i>E. coli</i> proteomics"));
11991222

1223+
// HTML entities are decoded: &amp; replaced by space (not left as "amp")
1224+
assertEquals("proteomics genomics", normalizeTitle("Proteomics &amp; Genomics"));
1225+
assertEquals("e coli analysis", normalizeTitle("&lt;i&gt;E. coli&lt;/i&gt; analysis"));
1226+
1227+
// Non-tag angle bracket expressions are NOT stripped by the tag regex; < and > are replaced by punctuation step.
1228+
assertEquals("proteins 10 kda threshold", normalizeTitle("Proteins <10 kDa threshold"));
1229+
assertEquals("proteins with mw 10 kda and 5 kda", normalizeTitle("Proteins with MW <10 kDa and >5 kDa"));
1230+
12001231
// Apostrophes are replaced with space (like other punctuation)
12011232
assertEquals("garcia s analysis", normalizeTitle("García's Analysis"));
12021233

@@ -1205,6 +1236,36 @@ public void testNormalizeTitle()
12051236
assertEquals("data independent acquisition", normalizeTitle("data independent acquisition"));
12061237
}
12071238

1239+
@Test
1240+
public void testStripQuerySpecialChars()
1241+
{
1242+
// Parentheses removed entirely (not replaced with space)
1243+
assertEquals("proteins analysis", stripQuerySpecialChars("protein(s) analysis"));
1244+
assertEquals("novel approach", stripQuerySpecialChars("[novel] approach"));
1245+
1246+
// Double quotes removed
1247+
assertEquals("the omics revolution", stripQuerySpecialChars("the \"omics\" revolution"));
1248+
1249+
// Null and empty
1250+
assertEquals("", stripQuerySpecialChars(null));
1251+
assertEquals("", stripQuerySpecialChars(""));
1252+
1253+
// No special chars — unchanged
1254+
assertEquals("Smith John", stripQuerySpecialChars("Smith John"));
1255+
1256+
// Chars removed entirely — no space inserted between adjacent text
1257+
assertEquals("ABC", stripQuerySpecialChars("A([B])C"));
1258+
}
1259+
1260+
@Test
1261+
public void testQuote()
1262+
{
1263+
assertEquals("\"PXD012345\"", quote("PXD012345"));
1264+
1265+
// Inner double quotes are removed
1266+
assertEquals("\"title with quotes\"", quote("title with \"quotes\""));
1267+
}
1268+
12081269
// -- extractPubMedId tests --
12091270

12101271
@Test

0 commit comments

Comments
 (0)