#!/usr/bin/env cwl-runner
### Pipeline to aggregate data from Climatology Lab
# Copyright (c) 2021-2022. Harvard University
#
# Developed by Research Software Engineering,
# Faculty of Arts and Sciences, Research Computing (FAS RC)
# Author: Michael A Bouzinier
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

cwlVersion: v1.2
class: Workflow

requirements:
  SubworkflowFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  MultipleInputFeatureRequirement: {}


doc: |
  This workflow downloads NetCDF datasets from the
  [University of Idaho Gridded Surface Meteorological Dataset](https://www.northwestknowledge.net/metdata/data/),
  aggregates the gridded data to daily mean values over chosen geographies,
  and optionally ingests the aggregated values into a database.

  The outputs of the workflow are gzipped CSV files containing the
  aggregated data.

  Optionally, the aggregated data can be ingested into a database
  specified by the connection parameters:

  * `database.ini`: a file containing connection descriptions
  * `connection_name`: a string referring to a section in the `database.ini`
    file, identifying the specific connection to be used.
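
  For example, a connection section in `database.ini` might look like
  this (a sketch; the exact keys depend on the connection parameters
  your database requires):

      [dorieh]
      host=localhost
      database=nsaph
      user=postgres
      password=*****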

  The workflow can be invoked either by providing command line options,
  as in the following example:

      toil-cwl-runner --retryCount 1 --cleanWorkDir never \
          --outdir /scratch/work/exposures/outputs \
          --workDir /scratch/work/exposures \
          gridmet.cwl \
          --database /opt/local/database.ini \
          --connection_name dorieh \
          --bands rmin rmax \
          --strategy auto \
          --geography zcta \
          --ram 8GB

  or by providing a YAML job file (see [example](jobs/test_gridmet_job))
  with similar options:

      toil-cwl-runner --retryCount 1 --cleanWorkDir never \
          --outdir /scratch/work/exposures/outputs \
          --workDir /scratch/work/exposures \
          gridmet.cwl test_gridmet_job.yml
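
  A minimal job file with equivalent options might look like this
  (a sketch; the values are illustrative):

      database:
        class: File
        path: /opt/local/database.ini
      connection_name: dorieh
      bands:
        - rmin
        - rmax
      strategy: auto
      geography: zcta
      ram: 8GB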


inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS Proxy if required
  shapes:
    type: Directory?
    doc: Do we even need this parameter, given that we download the shapes instead?
  geography:
    type: string
    doc: |
      Type of geography: zip codes or counties
      Valid values: "zip", "zcta" or "county"
  years:
    type: string[]
    default: ['1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
  bands:
    doc: |
      University of Idaho Gridded Surface Meteorological Dataset
      [bands](https://developers.google.com/earth-engine/datasets/catalog/IDAHO_EPSCOR_GRIDMET#bands)
    type: string[]
    # default: ['bi', 'erc', 'etr', 'fm100', 'fm1000', 'pet', 'pr', 'rmax', 'rmin', 'sph', 'srad', 'th', 'tmmn', 'tmmx', 'vpd', 'vs']
  strategy:
    type: string
    default: auto
    doc: |
      [Rasterization strategy](https://nsaph-data-platform.github.io/nsaph-platform-docs/common/gridmet/doc/strategy.html)
      used for spatial aggregation
  ram:
    type: string
    default: 2GB
    doc: |
      Runtime memory available to the process. When the aggregation
      strategy is `auto`, this value is used to calculate the optimal
      downscaling factor for the available resources.
  database:
    type: File
    doc: Path to database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
  dates:
    type: string?
    doc: Date range restriction, for testing purposes only
  domain:
    type: string
    default: climate


steps:
  init_db_schema:
    doc: Creates the target schema upfront, because the tables are later created in parallel
    run:
      class: CommandLineTool
      baseCommand: [python, -m, dorieh.platform.util.psql]
      doc: |
        This tool executes an SQL statement that creates the target
        schema in the database if it does not already exist
      inputs:
        database:
          type: File
          doc: Path to database connection file, usually database.ini
          inputBinding:
            prefix: --db
        connection_name:
          type: string
          doc: The name of the section in the database.ini file
          inputBinding:
            prefix: --connection
        domain:
          type: string
          #default: climate
      arguments:
        - valueFrom: $("CREATE SCHEMA IF NOT EXISTS " + inputs.domain + ';')
          position: 3
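          # With the default domain, the expression above resolves to the
          # positional argument: CREATE SCHEMA IF NOT EXISTS climate;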
      outputs:
        log:
          type: stdout
        err:
          type: stderr
      stderr: "schema.err"
      stdout: "schema.log"
    in:
      database: database
      connection_name: connection_name
      domain: domain
    out:
      - log
      - err

  make_registry:
    run: build_gridmet_model.cwl
    doc: Writes a YAML file with the database model
    in:
      depends_on: init_db_schema/log
      domain: domain
    out:
      - model
      - log
      - errors

  init_tables:
    doc: creates or recreates database tables, one for each band
    scatter:
      - band
    run:
      class: Workflow
      inputs:
        registry:
          type: File
        table:
          type: string
        domain:
          type: string
        database:
          type: File
        connection_name:
          type: string
      steps:
        reset:
          run: reset.cwl
          in:
            registry: registry
            domain: domain
            database: database
            connection_name: connection_name
            table: table
          out:
            - log
            - errors
        index:
          run: index.cwl
          in:
            depends_on: reset/log
            registry: registry
            domain: domain
            table: table
            database: database
            connection_name: connection_name
          out: [log, errors]
      outputs:
        reset_log:
          type: File
          outputSource: reset/log
        reset_err:
          type: File
          outputSource: reset/errors
        index_log:
          type: File
          outputSource: index/log
        index_err:
          type: File
          outputSource: index/errors
    in:
      registry: make_registry/model
      database: database
      connection_name: connection_name
      band: bands
      geography: geography
      domain: domain
      table:
        valueFrom: $(inputs.geography + '_' + inputs.band)
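        # The target table name is derived from the other inputs; e.g.,
        # geography "zcta" and band "rmax" yield the table name zcta_rmax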
    out:
      - reset_log
      - reset_err
      - index_log
      - index_err

  process:
    run: gridmet_one_file.cwl
    doc: Downloads raw data and aggregates it over shapes and time
    scatter:
      - band
      - year
    scatterMethod: nested_crossproduct
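    # nested_crossproduct runs the step for every (band, year) combination
    # and collects each output into a nested array indexed as [band][year]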

    in:
      proxy: proxy
      depends_on: init_tables/index_log
      model: make_registry/model
      shapes: shapes
      geography: geography
      strategy: strategy
      ram: ram
      year: years
      dates: dates
      band: bands
      database: database
      connection_name: connection_name
      domain: domain
      months:
        valueFrom: $([1,2,3,4,5,6,7,8,9,10,11,12])
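        # The full list of months, so that the entire year is aggregated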
      table:
        valueFrom: $(inputs.geography + '_' + inputs.band)

    out:
      - download_log
      - download_err
      - add_data_aggregate_errors
      - add_data_data
      - add_data_aggregate_log
      - add_data_ingest_log
      - add_data_ingest_errors
      - vacuum_log
      - vacuum_err

outputs:
  registry:
    type: File?
    outputSource: make_registry/model
  registry_log:
    type: File?
    outputSource: make_registry/log
  registry_err:
    type: File?
    outputSource: make_registry/errors

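  # The process step scatters over bands and years, hence the two outer
  # array levels below. The data, aggregation, and ingestion outputs have
  # a third, innermost level because gridmet_one_file.cwl emits an array
  # of files per (band, year) pair, presumably one per aggregated chunk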
  data:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_data
  download_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_log
  download_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_err

  process_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_log
  process_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_errors

  ingest_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_log
  ingest_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_errors

  reset_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_log
  reset_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_err

  index_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_log
  index_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_err

  vacuum_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_log
  vacuum_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_err