#!/usr/bin/env cwl-runner
### wustl.cwl
### Pipeline to ingest Monthly Pollution data downloaded from WashU Box

#  Copyright (c) 2021-2022. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

 23cwlVersion: v1.2
 24class: Workflow
 25
 26requirements:
 27  SubworkflowFeatureRequirement: {}
 28  StepInputExpressionRequirement: {}
 29  InlineJavascriptRequirement: {}
 30  ScatterFeatureRequirement: {}
 31  MultipleInputFeatureRequirement: {}
 32
 33
doc: |
  Workflow to aggregate monthly pollution data, delivered in NetCDF format,
  over the given geographies (zip codes or counties) and to ingest the
  aggregated data into the database

 39inputs:
 40  proxy:
 41    type: string?
 42    default: ""
 43    doc: HTTP/HTTPS Proxy if required
 44  shapes:
 45    type: Directory?
 46    doc: Do we even need this parameter, as we isntead downloading shapes?
 47  shape_file_collection:
 48    type: string
 49    default: tiger
 50    doc: |
 51      [Collection of shapefiles](https://www2.census.gov/geo/tiger), 
 52      either GENZ or TIGER
 53  downloads:
 54    type: Directory
 55    doc: Directory, containing files, downloaded and unpacked from WUSTL box
 56  geography:
 57    type: string
 58    doc: |
 59      Type of geography: zip codes or counties
 60      Valid values: "zip" or "county"
 61  years:
 62    type: int[]
 63    default: [2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018]
 64  months:
 65    type: int[]
 66    default: [1,2,3,4,5,6,7,8,9,10,11,12]
 67  band:
 68    type: string
 69    default: pm25
 70  strategy:
 71    type: string
 72    default: downscale
 73    doc: "Rasterization strategy"
 74  ram:
 75    type: string
 76    default: 2GB
 77    doc: Runtime memory, available to the process
 78  database:
 79    type: File
 80    doc: Path to database connection file, usually database.ini
 81  connection_name:
 82    type: string
 83    doc: The name of the section in the database.ini file
 84
 85steps:
 86  initdb:
 87    run: initdb.cwl
 88    doc: Ensure that database utilities are at their latest version
 89    in:
 90      database: database
 91      connection_name: connection_name
 92    out:
 93      - log
 94      - err
 95
 96  make_table_name:
 97    doc: Given variable and geography type (zip/county) evaluates table name
 98    run:
 99      class: ExpressionTool
100      inputs:
101        geography:
102          type: string
103        band:
104          type: string
105      expression: "$({'table': (inputs.band + '_monthly_' + inputs.geography + '_mean')})"
106      outputs:
107        table:
108          type: string
109    in:
110      geography: geography
111      band: band
112    out: [table]
113
114  init_tables:
115    doc: creates or recreates database tables, one for each band and geography
116    run: reset.cwl
117    in:
118      domain:
119        valueFrom: "exposures"
120      database: database
121      connection_name: connection_name
122      table: make_table_name/table
123      depends_on: initdb/log
124    out:
125      - log
126      - errors
127
128  process:
129    doc: Downloads raw data and aggregates it over shapes and time
130    scatter:
131      - year
132    run: wustl_one_year.cwl
133    in:
134      proxy: proxy
135      depends_on: init_tables/log
136      downloads: downloads
137      geography: geography
138      year: years
139      months: months
140      band: band
141      strategy: strategy
142      ram: ram
143      database: database
144      connection_name: connection_name
145      table: make_table_name/table
146      shape_file_collection: shape_file_collection
147    out:
148      - aggregate_data
149      - aggregate_log
150      - aggregate_err
151      - ingest_log
152      - ingest_err
153
154  index:
155    run: index.cwl
156    in:
157      depends_on: process/ingest_log
158      domain:
159        valueFrom: "exposures"
160      table: make_table_name/table
161      database: database
162      connection_name: connection_name
163    out: [log, errors]
164
165  vacuum:
166    run: vacuum.cwl
167    in:
168      depends_on: index/log
169      domain:
170        valueFrom: "exposures"
171      table: make_table_name/table
172      database: database
173      connection_name: connection_name
174    out: [log, errors]
175
176
outputs:
  # ---- Results of the scattered `process` step ----
  # Each is an array of arrays of File. The outer array has one entry per
  # scattered year; the inner array is whatever the per-year run emits for
  # that output (presumably one file per month - confirm in
  # wustl_one_year.cwl).
  data:
    outputSource: process/aggregate_data
    type:
      type: array
      items:
        type: array
        items: [File]

  aggregate_log:
    outputSource: process/aggregate_log
    type:
      type: array
      items:
        type: array
        items: [File]
  aggregate_err:
    outputSource: process/aggregate_err
    type:
      type: array
      items:
        type: array
        items: [File]

  ingest_log:
    outputSource: process/ingest_log
    type:
      type: array
      items:
        type: array
        items: [File]
  ingest_err:
    outputSource: process/ingest_err
    type:
      type: array
      items:
        type: array
        items: [File]

  # ---- Single-file logs of the table-level steps ----
  # `init_tables` is exposed under the `reset_*` names.
  reset_log:
    outputSource: init_tables/log
    type: File
  reset_err:
    outputSource: init_tables/errors
    type: File

  index_log:
    outputSource: index/log
    type: File
  index_err:
    outputSource: index/errors
    type: File

  vacuum_log:
    outputSource: vacuum/log
    type: File
  vacuum_err:
    outputSource: vacuum/errors
    type: File