From 1aeb4a836acab98820117f317c2062c408882d89 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 28 Feb 2023 11:29:46 -0600 Subject: [PATCH 1/5] reverted filtering method for psms passed to FlashLFQ --- .../SearchTask/PostSearchAnalysisTask.cs | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/TaskLayer/SearchTask/PostSearchAnalysisTask.cs b/TaskLayer/SearchTask/PostSearchAnalysisTask.cs index a535bae1f..5d85db40c 100644 --- a/TaskLayer/SearchTask/PostSearchAnalysisTask.cs +++ b/TaskLayer/SearchTask/PostSearchAnalysisTask.cs @@ -254,21 +254,11 @@ private void QuantificationAnalysis() } // get PSMs to pass to FlashLFQ - List unambiguousPsmsBelowOnePercentFdr = new(); - if (Parameters.AllPsms.Count > 100)//PEP is not computed when there are fewer than 100 psms - { - unambiguousPsmsBelowOnePercentFdr = Parameters.AllPsms.Where(p => - p.FdrInfo.PEP_QValue <= 0.01 - && !p.IsDecoy - && p.FullSequence != null).ToList(); //if ambiguous, there's no full sequence - } - else - { - unambiguousPsmsBelowOnePercentFdr = Parameters.AllPsms.Where(p => - p.FdrInfo.QValue <= 0.01 - && !p.IsDecoy - && p.FullSequence != null).ToList(); //if ambiguous, there's no full sequence - } + var unambiguousPsmsBelowOnePercentFdr = Parameters.AllPsms.Where(p => + p.FdrInfo.QValue <= 0.01 + && p.FdrInfo.QValueNotch <= 0.01 + && !p.IsDecoy + && p.FullSequence != null).ToList(); // pass protein group info for each PSM var psmToProteinGroups = new Dictionary>(); From 6d6ec6e2d0baae866f1be2cfeaa5b580ad61966b Mon Sep 17 00:00:00 2001 From: Michael Shortreed Date: Thu, 3 Aug 2023 15:15:09 -0500 Subject: [PATCH 2/5] functional but not final --- .../FdrAnalysis/FdrAnalysisEngine.cs | 58 ++++++++++++++++++- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs index 825216019..5d466b066 100644 --- 
a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs @@ -3,6 +3,8 @@ using System.Collections.Generic; using System.Linq; using System.Threading; +using Easy.Common.Extensions; +using MathNet.Numerics; namespace EngineLayer.FdrAnalysis { @@ -35,7 +37,7 @@ protected override MetaMorpheusEngineResults RunSpecific() Status("Running FDR analysis..."); DoFalseDiscoveryRateAnalysis(myAnalysisResults); - + myAnalysisResults.PsmsWithin1PercentFdr = AllPsms.Count(b => b.FdrInfo.QValue <= 0.01 && !b.IsDecoy); return myAnalysisResults; @@ -45,10 +47,10 @@ private void DoFalseDiscoveryRateAnalysis(FdrAnalysisResults myAnalysisResults) { // Stop if canceled if (GlobalVariables.StopLoops) { return; } - + var bubba = EValueByTailFittingForTopPsms(AllPsms.ToList()); // calculate FDR on a per-protease basis (targets and decoys for a specific protease) var psmsGroupedByProtease = AllPsms.GroupBy(p => p.DigestionParams.Protease); - + foreach (var proteasePsms in psmsGroupedByProtease) { var psms = proteasePsms.ToList(); @@ -166,6 +168,56 @@ private void DoFalseDiscoveryRateAnalysis(FdrAnalysisResults myAnalysisResults) } } + public double[] EValueByTailFittingForTopPsms(List allPSMs) + { + int myCount = allPSMs.Count; + var decoyScoreHistogram = allPSMs + .Where(p => p.IsDecoy) //we are fitting the tail to only decoy PSMs + .Select(p => (int)p.Score) //we are only interested in the integer score because the decimal portion is unrelated + .GroupBy(s => s).ToList(); //making a score histogram here. 
+ var j = decoyScoreHistogram.Count; + + double[] survival = new double[decoyScoreHistogram.Select(k=>k.Key).ToList().Max() + 1]; + + foreach (var scoreCountPair in decoyScoreHistogram) + { + survival[scoreCountPair.Key] = scoreCountPair.Count(); + } + + List logScores = new List(); + List logSurvivals = new List(); + + double runningSum = 0; + for (int i = survival.Length - 1; i > -1; i--) + { + runningSum += survival[i]; + survival[i] = runningSum; + } + + double countMax = survival.Max(); + + for (int i = 0; i < survival.Length; i++) + { + survival[i] /= countMax; + } + + double[] logSurvival = new double[survival.Length]; + for (int i = 0; i < survival.Length; i++) + { + if (survival[i] > 0 && survival[i] < (0.1 * survival.Max())) + { + logSurvival[i] = Math.Log10(survival[i]); + logScores.Add(Math.Log10(i)); + logSurvivals.Add(Math.Log10(survival[i])); + } + } + + (double intercept,double slope) p = Fit.Line(logScores.ToArray(), logSurvivals.ToArray()); + + + return logSurvival; + } + public void Compute_PEPValue(FdrAnalysisResults myAnalysisResults) { if (AnalysisType == "PSM") From 1c5e367da75a6ba721a2e6d4431eb40fcbd75195 Mon Sep 17 00:00:00 2001 From: Michael Shortreed Date: Fri, 4 Aug 2023 11:55:34 -0500 Subject: [PATCH 3/5] some useful comments --- .../FdrAnalysis/FdrAnalysisEngine.cs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs index 5d466b066..24b3795d5 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs @@ -167,8 +167,16 @@ private void DoFalseDiscoveryRateAnalysis(FdrAnalysisResults myAnalysisResults) Compute_PEPValue(myAnalysisResults); } } - - public double[] EValueByTailFittingForTopPsms(List allPSMs) + /// + /// The method fits a line for log[survival] vs. 
log[(int)score] for the top 10% scoring spectrum matches + /// This line can be used to compute the E-value of a spectrum match by inputting the log[(int)score] + /// and raising 10 to the power of the computed value (10^y) + /// For high scoring spectrum matches (~the set of spectrum matches at 1% FDR), this value should be <= 0 + /// This function should not be used to calculate E-Value for anything with greater than 1% FDR + /// + /// + /// + public (double intercept, double slope) EValueRegressionFormulaByTailFittingForTopPsms(List allPSMs) { int myCount = allPSMs.Count; var decoyScoreHistogram = allPSMs @@ -212,10 +220,7 @@ public double[] EValueByTailFittingForTopPsms(List allPSMs } } - (double intercept,double slope) p = Fit.Line(logScores.ToArray(), logSurvivals.ToArray()); - - - return logSurvival; + return Fit.Line(logScores.ToArray(), logSurvivals.ToArray()); } public void Compute_PEPValue(FdrAnalysisResults myAnalysisResults) From a5f46c7689f59eddc4ab5ce05a15d830d70a7b0e Mon Sep 17 00:00:00 2001 From: Michael Shortreed Date: Fri, 4 Aug 2023 12:01:59 -0500 Subject: [PATCH 4/5] more comments --- .../EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs index 24b3795d5..1f08b27a4 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs @@ -173,6 +173,10 @@ private void DoFalseDiscoveryRateAnalysis(FdrAnalysisResults myAnalysisResults) /// and raising 10 to the power of the computed value (10^y) /// For high scoring spectrum matches (~the set of spectrum matches at 1% FDR), this value should be <= 0 /// This function should not be used to calculate E-Value for anything with greater than 1% FDR + /// I found this strategy in an ASMS presentation PDF from 2006 + /// 
https://prospector.ucsf.edu/prospector/html/misc/publications/2006_ASMS_1.pdf + /// Protein Prospector and Ways Calculating Expectation Values + /// Aenoch J. Lynn; Robert J. Chalkley; Peter R. Baker; Mark R.Segal; and Alma L.Burlingame /// /// /// @@ -189,11 +193,11 @@ private void DoFalseDiscoveryRateAnalysis(FdrAnalysisResults myAnalysisResults) foreach (var scoreCountPair in decoyScoreHistogram) { - survival[scoreCountPair.Key] = scoreCountPair.Count(); + survival[scoreCountPair.Key] = scoreCountPair.Count();//the array already has a value of 0 at each index (which is the integer Morpheus score) during creation. so we only need to populate it where we have scores } - List logScores = new List(); - List logSurvivals = new List(); + List logScores = new List(); //x-values + List logSurvivals = new List(); //y-values double runningSum = 0; for (int i = survival.Length - 1; i > -1; i--) From 8abe4fa1cc5a1378fc28b656b6778cc21e7218fb Mon Sep 17 00:00:00 2001 From: Michael Shortreed Date: Tue, 8 Aug 2023 12:48:42 -0500 Subject: [PATCH 5/5] ttyi --- MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs index 1f08b27a4..775225518 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs @@ -196,8 +196,8 @@ private void DoFalseDiscoveryRateAnalysis(FdrAnalysisResults myAnalysisResults) survival[scoreCountPair.Key] = scoreCountPair.Count();//the array already has a value of 0 at each index (which is the integer Morpheus score) during creation. 
so we only need to populate it where we have scores } - List logScores = new List(); //x-values - List logSurvivals = new List(); //y-values + List logScores = new(); //x-values + List logSurvivals = new(); //y-values double runningSum = 0; for (int i = survival.Length - 1; i > -1; i--)