aggregate_one_file.cwl

  1#!/usr/bin/env cwl-runner
  2### Workflow to aggregate and ingest NetCDF files for one year
  3#  Copyright (c) 2021-2022. Harvard University
  4#
  5#  Developed by Research Software Engineering,
  6#  Faculty of Arts and Sciences, Research Computing (FAS RC)
  7#  Author: Michael A Bouzinier
  8#
  9#  Licensed under the Apache License, Version 2.0 (the "License");
 10#  you may not use this file except in compliance with the License.
 11#  You may obtain a copy of the License at
 12#
 13#         http://www.apache.org/licenses/LICENSE-2.0
 14#
 15#  Unless required by applicable law or agreed to in writing, software
 16#  distributed under the License is distributed on an "AS IS" BASIS,
 17#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18#  See the License for the specific language governing permissions and
 19#  limitations under the License.
 20#
 21
 22cwlVersion: v1.2
 23class: Workflow  # steps below: get_shapes, find_pm25_file, find_components_files, consolidate, aggregate
 24
 25requirements:
 26  SubworkflowFeatureRequirement: {}  # lets a step's `run` target be a Workflow itself
 27  StepInputExpressionRequirement: {}  # required by the `valueFrom` fields on step inputs below
 28  InlineJavascriptRequirement: {}  # required by the $(...) JavaScript expressions below
 29  ScatterFeatureRequirement: {}  # NOTE(review): no `scatter` is used in this file - confirm still needed
 30  MultipleInputFeatureRequirement: {}  # NOTE(review): no step input here lists several sources - confirm still needed
 31
 32
 33doc: |
 34  Sub-workflow to aggregate a NetCDF file for one year over a given
 35  geography (zip codes or counties). Before aggregation, downloads
 36  shape files for this year from the US Census website
 37
 38inputs:
 39  depends_on:  # not wired into any step of this workflow; presumably an ordering hook for callers - confirm
 40    type: Any?
 41  proxy:  # forwarded to get_shapes
 42    type: string?
 43    default: ""
 44    doc: HTTP/HTTPS Proxy if required
 45  downloads:  # directory handed to both file-lookup steps
 46    type: Directory
 47  geography:  # forwarded to get_shapes (as `geo`) and to aggregate
 48    type: string
 49  variable:  # main variable; looked up by find_pm25_file and passed to aggregate
 50    type: string
 51  component:  # component variables; looked up by find_components_files and passed to aggregate
 52    type: string[]
 53  year:  # rendered as a string via String(...) before being given to get_shapes
 54    type: int
 55  strategy:  # forwarded to aggregate
 56    type: string
 57    doc: "Rasterization strategy"
 58  ram:  # forwarded to aggregate
 59    type: string
 60    default: 2GB
 61    doc: Runtime memory, available to the process
 62  shape_file_collection:  # forwarded to get_shapes (as `collection`)
 63    type: string
 64    default: tiger
 65    doc: |
 66      [Collection of shapefiles](https://www2.census.gov/geo/tiger),
 67      either GENZ or TIGER
 68  table:  # forwarded to aggregate
 69    type: string?
 70    doc: |
 71      Optional name of the table where the aggregated data will be
 72      eventually stored
 73
 74steps:
 75  get_shapes:
 76    run: get_shapes.cwl
 77    doc: |
 78      This step downloads Shape files from a given collection (TIGER/Line or GENZ)
 79      and a geography (ZCTA or Counties) from the US Census website,
 80      for a given year or for the closest one.
 81
 82    in:
 83      year:
 84        valueFrom: $(String(inputs.yy))  # integer workflow year rendered as a string
 85      yy: year  # exposes the workflow `year` to the expression above; presumably only a helper - confirm against get_shapes.cwl
 86      geo: geography
 87      collection: shape_file_collection
 88      proxy: proxy
 89    out:
 90      - shape_files  # consumed by aggregate and exported as the `shapes` workflow output
 91
 92  find_pm25_file:  # NOTE(review): the doc text mentions a month, but no month input is wired below - confirm
 93    doc: |
 94      Given input directory, variable (band), year and month,
 95      evaluates the expected file name for the main variable input data
 96    run:  wustl_file_pattern.cwl
 97    in:
 98      year: year
 99      variables:
100        valueFrom: $([inputs.variable])  # wraps the single main variable in a one-element list
101      variable: variable  # makes the workflow `variable` readable as inputs.variable in the expression above
102      downloads: downloads
103    out: [netcdf_files]  # fed to consolidate as `abs_values`
104
105  find_components_files:
106    doc: |
107      Given input directory, component variables and year,
108      evaluates the expected file names for the component input data
109    run: wustl_file_pattern.cwl
110    in:
111      year: year
112      variables: component  # the workflow's list of component names, passed through unchanged
113      downloads: downloads
114    out: [netcdf_files]  # fed to consolidate as `components`
115
116  consolidate:
117    doc: consolidate components into one file
118    run: wustl_consolidate_components.cwl
119    in:
120      abs_values: find_pm25_file/netcdf_files  # files located for the main variable
121      components: find_components_files/netcdf_files  # files located for the component variables
122    out:
123      - consolidated_data  # consumed by aggregate and exported as a workflow output
124
125  aggregate:
126    doc: Aggregate data over geographies
127    run: aggregate_wustl.cwl
128    in:
129      strategy: strategy
130      ram: ram
131      geography: geography
132      netcdf_data: consolidate/consolidated_data  # consolidated file produced by the consolidate step
133      shape_files: get_shapes/shape_files  # shape files fetched by get_shapes
134      variable: variable  # also read as inputs.variable by the `band` expression below
135      components: component  # workflow input `component` is exposed to this step as `components`
136      table: table
137      band:
138        valueFrom: $([inputs.variable].concat(inputs.components))  # list: main variable first, then every component
139    out:
140      - log
141      - errors
142      - csv_data
143      - data_dictionary
144
145outputs:
146  shapes:  # shape files downloaded by get_shapes
147    type: File[]
148    outputSource: get_shapes/shape_files
149
150  consolidated_data:  # single file produced by the consolidate step
151    type: File
152    outputSource: consolidate/consolidated_data
153  aggregate_data:  # `csv_data` output of the aggregate step
154    type: File
155    outputSource: aggregate/csv_data
156  data_dictionary:  # optional
157    type: File?
158    outputSource: aggregate/data_dictionary
159  aggregate_log:  # optional
160    type: File?
161    outputSource: aggregate/log
162  aggregate_err:  # NOTE(review): required here while aggregate_log is optional - confirm this asymmetry is intended
163    type: File
164    outputSource: aggregate/errors