Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve compression of enhanced #40

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions runcards/runcard.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
# PDF Set #
###################################################
pdfsetting:
pdf: NNPDF40_nnlo_as_0118_1000
pdf: 210219-02-rs-nnpdf40-1000
existing_enhanced: False

###################################################
# Size of compressed PDF replicas #
###################################################
compressed: 500
compressed: 100

###################################################
# Choice of Minimizer #
Expand Down
40 changes: 31 additions & 9 deletions src/pycompressor/compressing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
from pycompressor.pdfgrid import XGrid
from pycompressor.pdfgrid import PdfSet
from pycompressor.compressor import Compress
from pycompressor.utils import map_index
from pycompressor.utils import extract_index
from pycompressor.utils import preprocess_enhanced
from pycompressor.utils import restore_permutation
from pycompressor.estimators import ALLOWED_ESTIMATORS


console = Console()
log = logging.getLogger(__name__)

# Initial scale (in GeV)
Q0 = 1
Q0 = 1.65
# Total number of flavour to 2nf+1=7
NF = 4

Expand All @@ -37,11 +41,11 @@ def splash():

style = Style(color="blue")
logo = Table(show_header=True, header_style="bold blue", style=style)
logo.add_column("𝖕𝖞𝕮𝖔𝖒𝖕𝖗𝖊𝖘𝖘𝖔𝖗", justify="center", width=60)
logo.add_column("𝖕𝖞𝕮𝖔𝖒𝖕𝖗𝖊𝖘𝖘𝖔𝖗", justify="center", width=76)
logo.add_row("[bold blue]Fast python compressor for PDF replicas.")
logo.add_row("[bold blue]https://n3pdf.github.io/pycompressor/")
logo.add_row("[bold blue]© N3PDF 2021")
logo.add_row("[bold blue]Authors: Stefano Carrazza, Juan E. Cruz-Martinez, Tanjona R. Rabemananjara")
logo.add_row("[bold blue]Authors: Stefano Carrazza, Juan M. Cruz-Martinez, Tanjona R. Rabemananjara")
console.print(logo)


Expand All @@ -67,7 +71,7 @@ def check_validity(pdfsetting, compressed, gans, est_dic):
def check_adiabaticity(pdfsetting, gans, compressed):
""" Check whether we are in an adiabatic optimization and if so if it can be performed """
pdf_name = pdfsetting["pdf"]
if pdfsetting.get("existing_enhanced") and not gans.get("enhanced"):
if pdfsetting.get("existing_enhanced") and not gans.get("enhanced"):
adiabatic_result = f"{pdf_name}/compress_{pdf_name}_{compressed}_output.dat"
if not pathlib.Path(adiabatic_result).exists():
raise CheckError(
Expand Down Expand Up @@ -121,7 +125,6 @@ def compressing(pdfsetting, compressed, minimizer, est_dic, gans):
postgans(str(pdf), outfolder, nbgen)

splash()
# Set seed
rndgen = Generator(PCG64(seed=0))

console.print("\n• Load PDF sets & Printing Summary:", style="bold blue")
Expand All @@ -134,15 +137,29 @@ def compressing(pdfsetting, compressed, minimizer, est_dic, gans):
try:
postgan = pdf + "_enhanced"
final_result = {"pdfset_name": postgan}
enhanced = PdfSet(postgan, xgrid, Q0, NF).build_pdf()
enhcd_grid = PdfSet(postgan, xgrid, Q0, NF).build_pdf()
processed, pindex, counts = preprocess_enhanced(enhcd_grid)
# Shuffled the enhanced PDF grid and save the shuffling
# index in order to restore it later.
shuffled_index = rndgen.choice(
processed.shape[0],
processed.shape[0],
replace=False
)
enhanced = processed[shuffled_index]
except RuntimeError as excp:
raise LoadingEnhancedError(f"{excp}")
nb_iter, ref_estimators = 100000, None
init_index = np.array(extract_index(pdf, compressed))
extr_index = np.array(extract_index(pdf, compressed))
map_pindex = map_index(pindex, extr_index)
init_index = map_index(shuffled_index, map_pindex)
assert extr_index.shape[0] == init_index.shape[0]
else:
final_result = {"pdfset_name": pdf}
nb_iter, ref_estimators = 15000, None
init_index, enhanced = rndindex, prior
# reset seeds
rndgen = Generator(PCG64(seed=1))

# Create output folder
outrslt = postgan if enhanced_already_exists else pdf
Expand All @@ -159,7 +176,7 @@ def compressing(pdfsetting, compressed, minimizer, est_dic, gans):
table.add_row("PDF set name", f"{pdf}")
table.add_row("Size of Prior", f"{prior.shape[0] - 1} replicas")
if enhanced_already_exists:
table.add_row("Size of enhanced", f"{enhanced.shape[0] - 1} replicas")
table.add_row("Size of enhanced", f"{enhcd_grid.shape[0] - 1} replicas")
table.add_row("Size of compression", f"{compressed} replicas")
table.add_row("Input energy Q0", f"{Q0} GeV")
table.add_row(
Expand Down Expand Up @@ -196,6 +213,10 @@ def compressing(pdfsetting, compressed, minimizer, est_dic, gans):
erf, index = comp.cma_algorithm(std_dev=0.8)
else:
raise ValueError(f"{minimizer} is not a valid minimizer.")
# Restore the shuffled index back in case of compression from
# an enhanced set
if enhanced_already_exists:
index = restore_permutation(index, shuffled_index, pindex)

# Prepare output file
final_result["ERFs"] = erf_list
Expand All @@ -207,7 +228,8 @@ def compressing(pdfsetting, compressed, minimizer, est_dic, gans):
console.print(f"\n• Final ERF: [bold red]{erf}.", style="bold red")

# Compute final ERFs for the final choosen replicas
final_err_func = comp.final_erfs(index)
samples = enhcd_grid if enhanced_already_exists else enhanced
final_err_func = comp.final_erfs(samples, index)
serfile = open(f"{out_folder}/erf_reduced.dat", "a+")
serfile.write(f"{compressed}:")
serfile.write(json.dumps(final_err_func))
Expand Down
4 changes: 2 additions & 2 deletions src/pycompressor/compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def all_error_function(self, index):
erf_res = self.err_func.compute_all_erf(reduc_rep)
return erf_res

def final_erfs(self, index):
def final_erfs(self, enhanced, index):
"""Compute the final ERF after minimization.

Parameters
Expand All @@ -96,7 +96,7 @@ def final_erfs(self, index):
Dictionary containing the list of estimators and their respective
values.
"""
selected_replicas = self.enhanced[index]
selected_replicas = enhanced[index]
erfs = self.err_func.compute_all_erf(selected_replicas)
return erfs

Expand Down
2 changes: 1 addition & 1 deletion src/pycompressor/errfunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ class ErfComputation:
Number of trials
"""

def __init__(self, prior, est_dic, nreduc, folder, rndgen, trials=1000, norm=True):
def __init__(self, prior, est_dic, nreduc, folder, rndgen, trials=10000, norm=True):
self.prior = prior
self.est_dic = est_dic
# Compute estimators for PRIOR replicas
Expand Down
71 changes: 71 additions & 0 deletions src/pycompressor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,77 @@
log = logging.getLogger(__name__)


def preprocess_enhanced(enhanced, dec_check=15):
    """Remove duplicate replicas from an enhanced PDF grid.

    Rows are compared after rounding to ``dec_check`` decimal places, so
    replicas that differ only below that precision are treated as
    duplicates. Note that the returned grid contains the *rounded*
    values, not the original ones.

    Parameters
    ----------
    enhanced: np.array(float)
        enhanced PDF grid
    dec_check: int
        number of decimal places used when comparing rows for equality

    Returns
    -------
    tuple(np.array, np.array, np.array)
        the de-duplicated (rounded) grid, the indices in the original
        grid of the rows that were kept, and the number of times each
        kept row occurred.
    """
    deduped, kept_index, multiplicity = np.unique(
        np.round(enhanced, dec_check),
        axis=0,
        return_index=True,
        return_counts=True,
    )
    return deduped, kept_index, multiplicity


def map_index(refarr, arr):
    """Map each element of `arr` to the position at which it occurs
    in `refarr`.

    Parameters
    ----------
    refarr: np.array(int)
        one dimensional reference array of integers with size M; if a
        value occurs several times, the last occurrence wins
    arr: np.array(int)
        one dimensional array of integers with size N, every element of
        which must occur in `refarr`

    Returns
    -------
    np.array(int)
        one dimensional array of integers with size N holding, for each
        element of `arr`, its index in `refarr`

    Raises
    ------
    KeyError
        if an element of `arr` does not occur in `refarr`
    """
    # Build the value -> position lookup once instead of scanning
    # `refarr` for every query.
    positions = {value: pos for pos, value in enumerate(refarr)}
    # np.fromiter handles the empty-input case (np.vectorize raises on
    # size-0 input) and raises KeyError on unknown values instead of
    # silently producing None entries.
    return np.fromiter((positions[v] for v in arr), dtype=int, count=len(arr))


def restore_permutation(index, shuffle, preprocess):
    """Undo the mapping of indices due to the pre-processing and the
    shuffling of the enhanced grid.

    The selected `index` refers to rows of the shuffled, de-duplicated
    grid; composing the two index arrays translates it back into row
    indices of the original enhanced grid.

    Parameters
    ----------
    index: np.array(int)
        indices selected by the compression (into the shuffled grid)
    shuffle: np.array(int)
        permutation that was applied to the pre-processed grid
    preprocess: np.array(int)
        indices of the rows kept by the pre-processing step

    Returns
    -------
    np.array(int)
        indices into the original enhanced grid
    """
    # First undo the shuffling, then map back through the rows kept by
    # the duplicate-removal step.
    return preprocess[shuffle[index]]


def remap_index(index, shuffled):
new_idx = []
for idx in index:
Expand Down