Fix actions (#15)
* feat: fetcher ability to list files from a path (for samples not published on DAS)

* Print into file

* run black

* run black

* change python version in yaml: >=3.8, <3.11

* set default mins and maxs to avoid crash

* path ends with /

* example files for new fetcher

* fix: actions

* fix: plotting workflow

Co-authored-by: Andrey Pozdnyakov <[email protected]>
Ming-Yan and andreypz authored Jan 11, 2023
1 parent 8764034 commit 6765222
Showing 14 changed files with 181 additions and 72 deletions.
17 changes: 11 additions & 6 deletions .github/workflows/plotting_test.yml
@@ -1,4 +1,4 @@
-name: plotting script test
+name: Test plotter
 on:
   push:
     branches: [ master ]
@@ -13,13 +13,17 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.10"]

     defaults:
       run:
         shell: "bash -l {0}"

     steps:
     - uses: actions/checkout@v2
+    - uses: cvmfs-contrib/github-action-cvmfs@v2
+      with:
+        cvmfs_repositories: 'grid.cern.ch'
+
     - name: Set conda environment
       uses: conda-incubator/setup-miniconda@v2
@@ -38,15 +42,16 @@ jobs:
         conda info
         conda env list
         conda list
+    - name: Install Repo
+      run: |
+        pip install -e .
     - name: data/MC plotter
       run: |
         python plotting/plotdataMC.py --cfg testfile/btv_datamc.yml
     - name: comparison plotter
       run: |
-        python plotting/comparison.py --cfg testfile/btv_comapre.yml
+        python plotting/comparison.py --cfg testfile/btv_comapre.yml
2 changes: 1 addition & 1 deletion README.md
@@ -72,7 +72,7 @@ voms-proxy-init --voms cms --vomses ~/.grid-security/vomses
 Use the `./filefetcher/fetch.py` script:
 
 ```
-python filefetcher/fetch.py --input input_DAS_list.txt --output ${output_name.json}
+python filefetcher/fetch.py --input filefetcher/input_DAS_list.txt --output output_name.json
 ```
 where the `input_DAS_list.txt` is a simple file with a list of dataset names extracted from DAS (you need to create it yourself for the samples you want to run over), and the output JSON file is created in the `./metadata` directory.
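For reference, `input_DAS_list.txt` holds one DAS dataset name per line (lines starting with `#` are skipped by the fetcher); the sample file added in this commit looks like:

```
/ZH_HToCC_ZToLL_M-125_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL17NanoAODv9-106X_mc2017_realistic_v9-v1/NANOAODSIM
/DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17NanoAODv9-106X_mc2017_realistic_v9-v1/NANOAODSIM
/MuonEG/anovak-Run2017F-31Mar2018-v1_PFNanoAOD-132927756bb7172bff7e08e5e9b7221f/USER
```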
2 changes: 1 addition & 1 deletion env.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - python=3.10
+  - python>=3.8, <3.11
   - voms
   - ca-policy-lcg
   - ca-certificates
186 changes: 145 additions & 41 deletions filefetcher/fetch.py
@@ -1,4 +1,5 @@
-import os
+import sys
+from os import popen, listdir, makedirs, path, system
 import json
 import argparse
 
@@ -8,11 +9,16 @@
 parser.add_argument(
     "-i",
     "--input",
-    default=r"singlemuon",
+    default=None,
+    type=str,
+    required=True,
     help="List of samples in DAS (default: %(default)s)",
 )
 parser.add_argument(
-    "-o", "--output", default=r"test_my_samples", help="Site (default: %(default)s)"
+    "-o",
+    "--output",
+    default=r"test_my_samples.json",
+    help="Site (default: %(default)s)",
 )
 parser.add_argument(
     "--xrd",
@@ -21,46 +27,144 @@
     help="xrootd prefix string (default: %(default)s)",
 )
 
+parser.add_argument(
+    "--from_path",
+    action="store_true",
+    help="For samples that are not published on DAS. If this option is set then the format of the --input file must be adjusted. It should be: \n dataset_name path_to_files.",
+    default=False,
+)
+
 args = parser.parse_args()
-fset = []
-
-with open(args.input) as fp:
-    lines = fp.readlines()
-    for line in lines:
-        fset.append(line)
-
-fdict = {}
-
-for dataset in fset:
-    if dataset.startswith("#") or dataset.strip() == "":
-        # print("we skip this line:", line)
-        continue
-    dsname = dataset.strip().split("/")[1]  # Dataset first name
-    Tier = dataset.strip().split("/")[
-        3
-    ]  # NANOAODSIM for regular samples, USER for private
-    instance = "prod/global"
-    if Tier == "USER":
-        instance = "prod/phys03"
-    print("Creating list of files for dataset", dsname, Tier, instance)
-    flist = (
-        os.popen(
-            (
-                "/cvmfs/cms.cern.ch/common/dasgoclient -query='instance={} file dataset={}'"
-            ).format(instance, fset[fset.index(dataset)].rstrip())
-        )
-        .read()
-        .split("\n")
-    )
-
-    if dsname not in fdict:
-        fdict[dsname] = [args.xrd + f for f in flist if len(f) > 1]
-    else:  # needed to collect all data samples into one common key "Data" (using append() would introduce a new element for the key)
-        fdict[dsname].extend([args.xrd + f for f in flist if len(f) > 1])
-
-# pprint.pprint(fdict, depth=1)
-
-output_file = "./metadata/%s" % (args.output)
-with open(output_file, "w") as fp:
-    json.dump(fdict, fp, indent=4)
-print("The file is saved at: ", output_file)
+
+
+def getFilesFromDas(args):
+    fset = []
+    with open(args.input) as fp:
+        lines = fp.readlines()
+        for line in lines:
+            fset.append(line)
+
+    fdict = {}
+
+    for dataset in fset:
+        if dataset.startswith("#") or dataset.strip() == "":
+            # print("we skip this line:", line)
+            continue
+        dsname = dataset.strip().split("/")[1]  # Dataset first name
+        Tier = dataset.strip().split("/")[
+            3
+        ]  # NANOAODSIM for regular samples, USER for private
+        instance = "prod/global"
+        if Tier == "USER":
+            instance = "prod/phys03"
+        print("Creating list of files for dataset", dsname, Tier, instance)
+        flist = (
+            popen(
+                (
+                    "/cvmfs/cms.cern.ch/common/dasgoclient -query='instance={} file dataset={}'"
+                ).format(instance, fset[fset.index(dataset)].rstrip())
+            )
+            .read()
+            .split("\n")
+        )
+
+        if dsname not in fdict:
+            fdict[dsname] = [args.xrd + f for f in flist if len(f) > 1]
+        else:  # needed to collect all data samples into one common key "Data" (using append() would introduce a new element for the key)
+            fdict[dsname].extend([args.xrd + f for f in flist if len(f) > 1])
+
+    # pprint.pprint(fdict, depth=1)
+    return fdict
+
+
+def getFilesFromPath(args, lim=None):
+
+    fdict = {}
+    fset = []
+    with open(args.input) as fp:
+        lines = fp.readlines()
+        for line in lines:
+            if line.startswith("#") or line.strip() == "":
+                continue
+            if line.startswith("/"):
+                print(
+                    "You are trying to read files from path, but providing a dataset in DAS:\n",
+                    line,
+                )
+                print("That's not gonna work, so we exit here")
+                sys.exit(1)
+            ds = line.strip().split()
+            print("ds=", ds)
+            dataset = ds[0]
+            fdict[ds[0]] = getRootFilesFromPath(ds[1])
+
+    return fdict
+
+
+def getRootFilesFromPath(d, lim=None):
+
+    import subprocess
+
+    if "xrootd" in d:
+        sp = d.split("/")
+        siteIP = "/".join(sp[0:4])
+        pathToFiles = "/".join(sp[3:]) + "/"
+        allfiles = str(
+            subprocess.check_output(["xrdfs", siteIP, "ls", pathToFiles]), "utf-8"
+        ).split("\n")
+        # rootfiles = [siteIP+'/'+f for i,f in enumerate(allfiles) if f.endswith(".root") and (lim==None or i<lim)]
+    else:
+        siteIP = ""
+        pathToFiles = d
+        allfiles = [
+            path.join(d, f) for i, f in enumerate(listdir(d)) if f.endswith(".root")
+        ]
+
+    # print(siteIP, pathToFiles)
+    rootfiles = []
+    for file_or_dir in allfiles:
+        # print(file_or_dir)
+        if file_or_dir == "" or file_or_dir == pathToFiles:
+            continue
+        file_or_dir = siteIP + file_or_dir
+        if file_or_dir.endswith(".root"):
+            if lim == None or len(rootfiles) < lim:
+                rootfiles.append(file_or_dir)
+
+        elif not "log" in file_or_dir and not file_or_dir[-1] == "/":
+            file_or_dir = file_or_dir + "/"
+            print("file or dir:", file_or_dir)
+            if lim == None:
+                rootfiles.extend(getRootFilesFromPath(file_or_dir))
+            elif len(rootfiles) < lim:
+                rootfiles.extend(
+                    getRootFilesFromPath(file_or_dir, lim - len(rootfiles))
+                )
+
+    # print("Input path:", d)
+    # print("List of root files to be processed:\n",rootfiles)
+
+    return rootfiles
+
+
+def main(args):
+
+    if args.from_path:
+        print("do it from path: ")
+
+        fdict = getFilesFromPath(args)
+
+    else:
+
+        fdict = getFilesFromDas(args)
+
+    # print(fdict)
+    output_file = "./metadata/%s" % (args.output)
+    with open(output_file, "w") as fp:
+        json.dump(fdict, fp, indent=4)
+    print("The file is saved at: ", output_file)
+
+
+if __name__ == "__main__":
+    print("This is the __main__ part")
+    main(args)
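As a worked example of the URL handling in the new `getRootFilesFromPath`: the `"xrootd" in d` branch matches here only because the redirector hostname itself contains the substring `xrootd`, as the RWTH path from `input_PATH_list.txt` does. A sketch of what the split produces, traced by hand with that path:

```python
# Split logic of getRootFilesFromPath, applied to the DYJets_UNLOPS
# entry from filefetcher/input_PATH_list.txt:
d = "root://grid-cms-xrootd.physik.rwth-aachen.de//store/user/andrey/NanoGEN/dyeej_UNLOPS/"
sp = d.split("/")
siteIP = "/".join(sp[0:4])            # "root://grid-cms-xrootd.physik.rwth-aachen.de/"
pathToFiles = "/".join(sp[3:]) + "/"  # "/store/user/andrey/NanoGEN/dyeej_UNLOPS//"
# (the trailing "//" is why the input path ending with "/" needed handling)
# "xrdfs <siteIP> ls <pathToFiles>" then lists files and subdirectories,
# which the function filters for .root and recurses into.
```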
File renamed without changes.
3 changes: 3 additions & 0 deletions filefetcher/input_DAS_list.txt
@@ -0,0 +1,3 @@
+/ZH_HToCC_ZToLL_M-125_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL17NanoAODv9-106X_mc2017_realistic_v9-v1/NANOAODSIM
+/DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17NanoAODv9-106X_mc2017_realistic_v9-v1/NANOAODSIM
+/MuonEG/anovak-Run2017F-31Mar2018-v1_PFNanoAOD-132927756bb7172bff7e08e5e9b7221f/USER
2 changes: 2 additions & 0 deletions filefetcher/input_PATH_list.txt
@@ -0,0 +1,2 @@
+DYJets_UNLOPS root://grid-cms-xrootd.physik.rwth-aachen.de//store/user/andrey/NanoGEN/dyeej_UNLOPS/
+BReg_Test /net/scratch_cms3a/hcc/breg_nano17/ZH_Hbb/
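Given this list, the new path-based mode would be invoked roughly as follows (`my_path_samples.json` is a hypothetical output name; `--from_path` switches the input parsing to the `dataset_name path_to_files` format described in the argparse help above):

```
python filefetcher/fetch.py --input filefetcher/input_PATH_list.txt --output my_path_samples.json --from_path
```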
6 changes: 0 additions & 6 deletions filefetcher/list.txt

This file was deleted.

4 changes: 2 additions & 2 deletions plotting/comparison.py
@@ -133,7 +133,7 @@
     ax.set_xlabel(None)
     ax.set_ylabel("Events")
     rax.set_ylabel("Other/Ref")
-    ax.ticklabel_format(style="sci", scilimits=(-3, 3))
+    ax.ticklabel_format(style="sci", axis="y", scilimits=(-3, 3))
     ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05))
     ax.legend()
     rax.set_ylim(0.0, 2.0)
@@ -161,4 +161,4 @@
     fig.savefig(f"plot/{config['output']}_{time}/compare_{var}{logext}.png")
 
 
-print(f"The output is saved at: plot/{config['output']}_{time}")
+print(f"The output is saved at: plot/{config['output']}_{time}/")
16 changes: 3 additions & 13 deletions runner.py
@@ -12,16 +12,6 @@
 
 from BTVNanoCommissioning.workflows import workflows
 
-# This would crash if the ExampleWorkflow does not exist
-from ExampleWorkflow.workflows import workflows
-from ZplusJets.workflows import workflows
-
-# from ExampleWorkflow.workflows import workflows
-# from VHcc.workflows import workflows
-# from Hpluscharm.workflows import workflows
-
-# Should come up with a smarter way to import all worflows from subdirectories of ./src/
-
 
 def validate(file):
     try:
@@ -255,13 +245,13 @@ def get_main_parser():
                 print(f"Removing: {fi}")
                 os.system(f"rm {fi}")
         if input("Write list of bad files? (y/n)") == "y":
-            corrupted_name = (args.samplejson).split('.json')[0]
-            with open(f'{corrupted_name}_corrupted.txt', 'w') as bad_txt:
+            corrupted_name = (args.samplejson).split(".json")[0]
+            with open(f"{corrupted_name}_corrupted.txt", "w") as bad_txt:
                 print("Writing:")
                 for fi in all_invalid:
                     print(f"Writing: {fi}")
                     bad_txt.write(fi)
-                    bad_txt.write('\n')
+                    bad_txt.write("\n")
         sys.exit(0)
 
     # load workflow
5 changes: 4 additions & 1 deletion runner_wconfig.py
@@ -415,5 +415,8 @@ def retry_handler(exception, task_record):
 
     save(output, config.outfile)
 
-    print(output)
+    # print(output) better to print this in a file:
+    with open(f"{config.outfile}.print.txt", "w") as f:
+        print(output, file=f)
 
     print(f"Saving output to {config.outfile}")
7 changes: 7 additions & 0 deletions src/BTVNanoCommissioning/helpers/xsection.py
@@ -1233,6 +1233,13 @@
         "energy": "13",
         "comment": "Private sample",
     },
+    {
+        "process_name": "DYJets_UNLOPS",
+        "cross_section": "2025",
+        "DAS": "root://grid-cms-xrootd.physik.rwth-aachen.de//store/user/andrey/NanoGEN/dyeej_UNLOPS/",
+        "energy": "13",
+        "comment": "Private sample",
+    },
     {
         "process_name": "Template",
         "cross_section": "11",
1 change: 1 addition & 0 deletions src/BTVNanoCommissioning/utils/plot_utils.py
@@ -522,6 +522,7 @@ def plotratio(
 
 def autoranger(hist):
     val, axis = hist.values(), hist.axes[-1].edges
+    mins, maxs = 0, len(val)
     for i in range(len(val)):
         if val[i] != 0:
             mins = i
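The diff is truncated here. For context, a hypothetical completion of `autoranger` under the assumption that it scans for the first and last non-empty bins (the defaults added above keep it from crashing when every bin is empty):

```python
def autoranger(hist):
    val, axis = hist.values(), hist.axes[-1].edges
    # Defaults cover the all-empty histogram, so the scans below may find nothing.
    mins, maxs = 0, len(val)
    for i in range(len(val)):  # first non-empty bin
        if val[i] != 0:
            mins = i
            break
    for i in reversed(range(len(val))):  # last non-empty bin
        if val[i] != 0:
            maxs = i + 1
            break
    return axis[mins], axis[maxs]
```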
2 changes: 1 addition & 1 deletion testfile/btv_compare.yml
@@ -79,4 +79,4 @@ rescale_yields:
   runC : 100.0
 inbox_text: "" #Optional, text write in anchor text box
 norm : True #Optional, normalize to reference yields
-log : True #Optional, log y axis
+log : True #Optional, log y axis
