Source code for cms.random_selector

"""
This module selects random lines from the data in
CSV files and outputs the selected lines.

See also `Random selector for CMS data <mcr_create_test_data.html>`_

"""

#  Copyright (c) 2021. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import gzip
from argparse import ArgumentParser

import sys

import os
import glob
import random

from nsaph_utils.utils.io_utils import fopen


# Fixed seed passed to random.seed() in select() before any file is sampled.
SEED = 1


def select(pattern: str, destination: str, threshold: float):
    """
    Copies a random subset of lines from every file matching a glob
    pattern into gzip-compressed files under the destination directory.

    For each source file the output path is
    ``<destination>/<name of the source's parent directory>/<file name>``,
    with ``.gz`` appended to the file name unless it already ends
    with it. Missing output directories are created. A source file
    whose output file already exists is skipped.

    The random number generator is seeded with ``SEED`` once, before
    the first file is read, and files are processed in sorted order so
    that the same set of inputs yields the same selection on every run.
    Note that skipped files consume no random numbers, hence the lines
    selected from the remaining files depend on which files were skipped.

    :param pattern: glob pattern to select incoming files
    :param destination: root directory for the output files
    :param threshold: share of the data to be selected; a line is kept
        when ``random.random() < threshold``
    :return: None
    """

    # glob.glob returns matches in arbitrary order; sorting keeps the
    # order in which the seeded generator is consumed stable across runs.
    files = sorted(glob.glob(pattern))
    random.seed(SEED)
    for f in files:
        name = os.path.basename(f)
        if not name.endswith('.gz'):
            name += ".gz"
        # Mirror the immediate parent directory of the source file
        # under the destination root.
        dest_dir = os.path.join(
            destination, os.path.basename(os.path.dirname(f))
        )
        os.makedirs(dest_dir, exist_ok=True)
        dest = os.path.join(dest_dir, name)
        if os.path.isfile(dest):
            print("Skipping: {}".format(f))
            continue
        print("{} ==> {}".format(f, dest))
        total = 0
        selected = 0
        with fopen(f, "rt") as src, gzip.open(dest, "wt") as output:
            for line in src:
                total += 1
                if random.random() < threshold:
                    output.write(line)
                    selected += 1
        # Report selected/total line counts for this file.
        print("{:d}/{:d}".format(selected, total))
    print("All Done")
def args(argv=None):
    """
    Parses command line arguments

        --in INPUT
            pattern to select incoming files
            (default: ``data/*/*.csv*``)
        --out OUT
            Directory to output the random selection
            (default: ``random_data``)
        --selector SELECTOR
            A float value specifying the share of data to be selected
            (default: ``0.02``)

    :param argv: optional list of argument strings to parse; when None
        (the default) the arguments are taken from ``sys.argv``
    :return: ``argparse.Namespace`` with attributes ``input``, ``out``
        and ``selector``
    """

    # The text is passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would make the text
    # appear as the program name in the usage line.
    parser = ArgumentParser(description="Random records selector")
    parser.add_argument("--in",
                        help="pattern to select incoming files",
                        default="data/*/*.csv*",
                        dest="input",
                        required=False)
    parser.add_argument("--out",
                        help="Directory to output the random selection",
                        default="random_data",
                        required=False)
    parser.add_argument("--selector",
                        help="A float value specifying the "
                             + "share of data to be selected",
                        default=0.02,
                        type=float,
                        required=False)
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: parse the command line and run the selection.
    options = args()
    select(
        pattern=options.input,
        destination=options.out,
        threshold=options.selector,
    )