1#!/usr/bin/env cwl-runner
2### Pipeline to ingest Monthly Pollution data downloaded from WashU Box
3
4# Copyright (c) 2021-2022. Harvard University
5#
6# Developed by Research Software Engineering,
7# Faculty of Arts and Sciences, Research Computing (FAS RC)
8# Author: Michael A Bouzinier
9#
10# Licensed under the Apache License, Version 2.0 (the "License");
11# you may not use this file except in compliance with the License.
12# You may obtain a copy of the License at
13#
14# http://www.apache.org/licenses/LICENSE-2.0
15#
16# Unless required by applicable law or agreed to in writing, software
17# distributed under the License is distributed on an "AS IS" BASIS,
18# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19# See the License for the specific language governing permissions and
20# limitations under the License.
21#
22
cwlVersion: v1.2
class: Workflow

# Optional CWL features this workflow relies on (listed alphabetically).
requirements:
  # The inline `make_table_name` ExpressionTool evaluates a JavaScript
  # expression.
  InlineJavascriptRequirement: {}
  # NOTE(review): no step in this file lists several sources for a single
  # input; confirm whether this is still needed before removing it.
  MultipleInputFeatureRequirement: {}
  # The `process` step is scattered over the list of years.
  ScatterFeatureRequirement: {}
  # Step inputs use `valueFrom` (the constant "exposures" domain).
  StepInputExpressionRequirement: {}
  # Needed when a step's `run` target is itself a Workflow; the referenced
  # .cwl files are not visible from here.
  SubworkflowFeatureRequirement: {}

doc: |
  Workflow to aggregate pollution data coming in NetCDF format
  over given geographies (zip codes or counties) and ingest the
  aggregated data into the database
38
# Workflow parameters. Inputs with a `default` (or an optional `?` type) may
# be omitted from the job file.
inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS Proxy if required
  shapes:
    type: Directory?
    # NOTE(review): no step of this workflow consumes this input.
    doc: Do we even need this parameter, as we are instead downloading shapes?
  shape_file_collection:
    type: string
    default: tiger
    doc: |
      [Collection of shapefiles](https://www2.census.gov/geo/tiger),
      either GENZ or TIGER
  downloads:
    type: Directory
    doc: Directory, containing files, downloaded and unpacked from WUSTL box
  geography:
    type: string
    doc: |
      Type of geography: zip codes or counties
      Valid values: "zip" or "county"
  years:
    type: int[]
    default: [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
              2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    doc: Years to process; the `process` step is scattered over this list
  months:
    type: int[]
    default: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    doc: Months (as numbers) to process within every year
  band:
    type: string
    default: pm25
    doc: |
      Name of the variable (band) to process; it is also the first
      component of the name of the target database table
  strategy:
    type: string
    default: downscale
    doc: "Rasterization strategy"
  ram:
    type: string
    default: "2GB"
    doc: Runtime memory, available to the process
  database:
    type: File
    doc: Path to database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
84
# Execution order is driven by the `depends_on` links:
# initdb -> init_tables -> process (one run per year) -> index -> vacuum.
# `make_table_name` has no upstream step and feeds the table name to the rest.
steps:
  initdb:
    run: initdb.cwl
    doc: Ensure that database utilities are at their latest version
    in:
      database: database
      connection_name: connection_name
    out:
      - log
      - err

  make_table_name:
    doc: Given variable and geography type (zip/county) evaluates table name
    run:
      class: ExpressionTool
      inputs:
        geography:
          type: string
        band:
          type: string
      # E.g. band "pm25" and geography "zip" give "pm25_monthly_zip_mean".
      expression: "$({'table': (inputs.band + '_monthly_' + inputs.geography + '_mean')})"
      outputs:
        table:
          type: string
    in:
      geography: geography
      band: band
    out:
      - table

  init_tables:
    doc: creates or recreates database tables, one for each band and geography
    run: reset.cwl
    in:
      domain:
        valueFrom: "exposures"
      database: database
      connection_name: connection_name
      table: make_table_name/table
      # Makes this step wait for `initdb` to finish.
      depends_on: initdb/log
    out:
      - log
      - errors

  process:
    # NOTE(review): the doc says "Downloads", while the raw files are handed
    # in through the `downloads` directory; confirm against wustl_one_year.cwl.
    doc: Downloads raw data and aggregates it over shapes and time
    # One run of wustl_one_year.cwl per element of `years`.
    scatter:
      - year
    run: wustl_one_year.cwl
    in:
      proxy: proxy
      # Makes every per-year run wait until the tables have been (re)created.
      depends_on: init_tables/log
      downloads: downloads
      geography: geography
      year: years
      months: months
      band: band
      strategy: strategy
      ram: ram
      database: database
      connection_name: connection_name
      table: make_table_name/table
      shape_file_collection: shape_file_collection
    out:
      - aggregate_data
      - aggregate_log
      - aggregate_err
      - ingest_log
      - ingest_err

  index:
    doc: Runs index.cwl for the table once all years have been ingested
    run: index.cwl
    in:
      # Ingestion logs of all years: this step starts after every scattered
      # `process` run has completed.
      depends_on: process/ingest_log
      domain:
        valueFrom: "exposures"
      table: make_table_name/table
      database: database
      connection_name: connection_name
    out:
      - log
      - errors

  vacuum:
    doc: Runs vacuum.cwl for the table after the index step has completed
    run: vacuum.cwl
    in:
      depends_on: index/log
      domain:
        valueFrom: "exposures"
      table: make_table_name/table
      database: database
      connection_name: connection_name
    out:
      - log
      - errors
175
176
# Workflow results.
#
# Everything produced by the scattered `process` step is an array of arrays
# of files: the outer array comes from the scatter over years, the inner one
# is what a single wustl_one_year.cwl run returns (presumably one entry per
# month; confirm against that file).
outputs:
  # --- per-year aggregation results and logs ---
  data:
    type:
      type: array
      items:
        type: array
        items:
          - File
    outputSource: process/aggregate_data

  aggregate_log:
    type:
      type: array
      items:
        type: array
        items:
          - File
    outputSource: process/aggregate_log
  aggregate_err:
    type:
      type: array
      items:
        type: array
        items:
          - File
    outputSource: process/aggregate_err

  # --- per-year ingestion logs ---
  ingest_log:
    type:
      type: array
      items:
        type: array
        items:
          - File
    outputSource: process/ingest_log
  ingest_err:
    type:
      type: array
      items:
        type: array
        items:
          - File
    outputSource: process/ingest_err

  # --- logs of the `init_tables` step ---
  reset_log:
    type: File
    outputSource: init_tables/log
  reset_err:
    type: File
    outputSource: init_tables/errors

  # --- logs of the `index` step ---
  index_log:
    type: File
    outputSource: index/log
  index_err:
    type: File
    outputSource: index/errors

  # --- logs of the `vacuum` step ---
  vacuum_log:
    type: File
    outputSource: vacuum/log
  vacuum_err:
    type: File
    outputSource: vacuum/errors