gridmet.cwl

  1#!/usr/bin/env cwl-runner
  2### Pipeline to aggregate data from Climatology Lab
  3#  Copyright (c) 2021-2022. Harvard University
  4#
  5#  Developed by Research Software Engineering,
  6#  Faculty of Arts and Sciences, Research Computing (FAS RC)
  7#  Author: Michael A Bouzinier
  8#
  9#  Licensed under the Apache License, Version 2.0 (the "License");
 10#  you may not use this file except in compliance with the License.
 11#  You may obtain a copy of the License at
 12#
 13#         http://www.apache.org/licenses/LICENSE-2.0
 14#
 15#  Unless required by applicable law or agreed to in writing, software
 16#  distributed under the License is distributed on an "AS IS" BASIS,
 17#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18#  See the License for the specific language governing permissions and
 19#  limitations under the License.
 20#
 21
 22cwlVersion: v1.2
 23class: Workflow
 24
 25requirements:
 26  SubworkflowFeatureRequirement: {}
 27  StepInputExpressionRequirement: {}
 28  InlineJavascriptRequirement: {}
 29  ScatterFeatureRequirement: {}
 30  MultipleInputFeatureRequirement: {}
 31
 32
 33doc: |
 34  This workflow downloads NetCDF datasets from 
 35  [University of Idaho Gridded Surface Meteorological Dataset](https://www.northwestknowledge.net/metdata/data/), 
 36  aggregates gridded data to daily mean values over chosen geographies
 37  and optionally ingests it into the database.
 38  
 39  The output of the workflow are gzipped CSV files containing
 40  aggregated data. 
 41  
 42  Optionally, the aggregated data can be ingested into a database
 43  specified in the connection parameters:
 44  
 45  * `database.ini` file containing connection descriptions
 46  * `connection_name` a string referring to a section in the `database.ini`
 47     file, identifying specific connection to be used.
 48
 49  The workflow can be invoked either by providing command line options 
 50  as in the following example:
 51  
 52      toil-cwl-runner --retryCount 1 --cleanWorkDir never \ 
 53          --outdir /scratch/work/exposures/outputs \ 
 54          --workDir /scratch/work/exposures \
 55          gridmet.cwl \  
 56          --database /opt/local/database.ini \ 
 57          --connection_name dorieh \ 
 58          --bands rmin rmax \ 
 59          --strategy auto \ 
 60          --geography zcta \ 
 61          --ram 8GB
 62
 63  Or, by providing a YaML file (see [example](jobs/test_gridmet_job)) 
 64  with similar options:
 65  
 66      toil-cwl-runner --retryCount 1 --cleanWorkDir never \ 
 67          --outdir /scratch/work/exposures/outputs \ 
 68          --workDir /scratch/work/exposures \
 69          gridmet.cwl test_gridmet_job.yml 
 70  
 71
 72inputs:
 73  proxy:
 74    type: string?
 75    default: ""
 76    doc: HTTP/HTTPS Proxy if required
 77  shapes:
 78    type: Directory?
 79    doc: Do we even need this parameter, as we instead downloading shapes?
 80  geography:
 81    type: string
 82    doc: |
 83      Type of geography: zip codes or counties
 84      Valid values: "zip", "zcta" or "county"
 85  years:
 86    type: string[]
 87    default: ['1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
 88  bands:
 89    doc: |
 90      University of Idaho Gridded Surface Meteorological Dataset 
 91      [bands](https://developers.google.com/earth-engine/datasets/catalog/IDAHO_EPSCOR_GRIDMET#bands)
 92    type: string[]
 93    # default: ['bi', 'erc', 'etr', 'fm100', 'fm1000', 'pet', 'pr', 'rmax', 'rmin', 'sph', 'srad', 'th', 'tmmn', 'tmmx', 'vpd', 'vs']
 94  strategy:
 95    type: string
 96    default: auto
 97    doc: |
 98      [Rasterization strategy](https://nsaph-data-platform.github.io/nsaph-platform-docs/common/gridmet/doc/strategy.html)
 99      used for spatial aggregation
100  ram:
101    type: string
102    default: 2GB
103    doc: |
104      Runtime memory, available to the process. When aggregation
105      strategy is `auto`, this value is used to calculate the optimal
106      downscaling factor for the available resources. 
107  database:
108    type: File
109    doc: Path to database connection file, usually database.ini
110  connection_name:
111    type: string
112    doc: The name of the section in the database.ini file
113  dates:
114    type: string?
115    doc: 'dates restriction, for testing purposes only'
116  domain:
117    type: string
118    default: climate
119
120
121steps:
122  init_db_schema:
123    doc: We need to do it because of parallel creation of tables
124    run:
125      class: CommandLineTool
126      baseCommand: [python, -m, dorieh.platform.util.psql]
127      doc: |
128        This tool executes an SQL statement in the database to grant
129        read privileges to NSAPH users (memebrs of group nsaph_admin)
130      inputs:
131        database:
132          type: File
133          doc: Path to database connection file, usually database.ini
134          inputBinding:
135            prefix: --db
136        connection_name:
137          type: string
138          doc: The name of the section in the database.ini file
139          inputBinding:
140            prefix: --connection
141        domain:
142          type: string
143          #default: climate
144      arguments:
145        - valueFrom: $("CREATE SCHEMA IF NOT EXISTS " + inputs.domain + ';')
146          position: 3
147      outputs:
148        log:
149          type: stdout
150        err:
151          type: stderr
152      stderr: "schema.err"
153      stdout: "schema.log"
154    in:
155      database: database
156      connection_name: connection_name
157      domain: domain
158    out:
159      - log
160      - err
161
162  make_registry:
163    run: build_gridmet_model.cwl
164    doc: Writes down YAML file with the database model
165    in:
166      depends_on: init_db_schema/log
167      domain: domain
168    out:
169      - model
170      - log
171      - errors
172
173  init_tables:
174    doc: creates or recreates database tables, one for each band
175    scatter:
176      - band
177    run:
178      class: Workflow
179      inputs:
180        registry:
181          type: File
182        table:
183          type: string
184        domain:
185          type: string
186        database:
187          type: File
188        connection_name:
189          type: string
190      steps:
191        reset:
192          run: reset.cwl
193          in:
194            registry:  registry
195            domain: domain
196            database: database
197            connection_name: connection_name
198            table: table
199          out:
200            - log
201            - errors
202        index:
203          run: index.cwl
204          in:
205            depends_on: reset/log
206            registry: registry
207            domain: domain
208            table: table
209            database: database
210            connection_name: connection_name
211          out: [log, errors]
212      outputs:
213        reset_log:
214          type: File
215          outputSource: reset/log
216        reset_err:
217          type: File
218          outputSource: reset/errors
219        index_log:
220          type: File
221          outputSource: index/log
222        index_err:
223          type: File
224          outputSource: index/errors
225    in:
226      registry:  make_registry/model
227      database: database
228      connection_name: connection_name
229      band: bands
230      geography: geography
231      domain: domain
232      table:
233        valueFrom: $(inputs.geography + '_' + inputs.band)
234    out:
235      - reset_log
236      - reset_err
237      - index_log
238      - index_err
239
240  process:
241    run: gridmet_one_file.cwl
242    doc: Downloads raw data and aggregates it over shapes and time
243    scatter:
244      - band
245      - year
246    scatterMethod: nested_crossproduct
247
248    in:
249      proxy: proxy
250      depends_on: init_tables/index_log
251      model: make_registry/model
252      shapes: shapes
253      geography: geography
254      strategy: strategy
255      ram: ram
256      year: years
257      dates: dates
258      band: bands
259      database: database
260      connection_name: connection_name
261      domain: domain
262      months:
263        valueFrom: $([1,2,3,4,5,6,7,8,9,10,11,12])
264      table:
265        valueFrom: $(inputs.geography + '_' + inputs.band)
266
267    out:
268      - download_log
269      - download_err
270      - add_data_aggregate_errors
271      - add_data_data
272      - add_data_aggregate_log
273      - add_data_ingest_log
274      - add_data_ingest_errors
275      - vacuum_log
276      - vacuum_err
277
278outputs:
279  registry:
280    type: File?
281    outputSource: make_registry/model
282  registry_log:
283    type: File?
284    outputSource: make_registry/log
285  registry_err:
286    type: File?
287    outputSource: make_registry/errors
288
289  data:
290    type:
291      type: array
292      items:
293        type: array
294        items:
295          type: array
296          items: [File]
297    outputSource: process/add_data_data
298  download_log:
299    type:
300      type: array
301      items:
302        type: array
303        items: [File]
304    outputSource: process/download_log
305  download_err:
306    type:
307      type: array
308      items:
309        type: array
310        items: [File]
311    outputSource: process/download_err
312
313  process_log:
314    type:
315      type: array
316      items:
317        type: array
318        items:
319          type: array
320          items: [File]
321    outputSource: process/add_data_aggregate_log
322  process_err:
323    type:
324      type: array
325      items:
326        type: array
327        items:
328          type: array
329          items: [File]
330    outputSource: process/add_data_aggregate_errors
331
332  ingest_log:
333    type:
334      type: array
335      items:
336        type: array
337        items:
338          type: array
339          items: [File]
340    outputSource: process/add_data_ingest_log
341  ingest_err:
342    type:
343      type: array
344      items:
345        type: array
346        items:
347          type: array
348          items: [File]
349    outputSource: process/add_data_ingest_errors
350
351  reset_log:
352    type:
353      type: array
354      items: [File]
355    outputSource: init_tables/reset_log
356  reset_err:
357    type:
358      type: array
359      items: [File]
360    outputSource: init_tables/reset_err
361
362  index_log:
363    type:
364      type: array
365      items: [File]
366    outputSource: init_tables/index_log
367  index_err:
368    type:
369      type: array
370      items: [File]
371    outputSource: init_tables/index_err
372
373  vacuum_log:
374    type:
375      type: array
376      items:
377        type: array
378        items: [File]
379    outputSource: process/vacuum_log
380  vacuum_err:
381    type:
382      type: array
383      items:
384        type: array
385        items: [File]
386    outputSource: process/vacuum_err