#!/usr/bin/env python
"""
Combine data from the various CMIP6 files into one file per year,
containing all species.  Reading one file per year instead of one file
per species per year is more computationally efficient.

NOTE: Some post-processing is required, to change the unit string to
"days since YYYY-01-01 00:00:00" and to chunk and compress the data.
Use script post_process_data.sh to do this.

Bob Yantosca (yantosca@seas.harvard.edu), 18 Dec 2019
"""

# Imports
from os.path import join

import numpy as np
import xarray as xr

# Main data directory (edit for your system)
rootdir = "/n/holylfs/EXTERNAL_REPOS/GEOS-CHEM/gcgrid/gcdata/ExtData/HEMCO/CMIP6/v2019-09"

# Species to be combined: one CMIP6 input file exists per species,
# all sharing the same filename template.
species = ["CH2Cl2", "CH3Br", "CH3Cl", "CH4", "CHCl3"]

# Data files (one per species)
files = [
    join(rootdir, f"CMIP6_GHG_surface_VMR_1750_2014_for_{s}.nc") for s in species
]

# Read everything into one dataset
ds = xr.open_mfdataset(files)

# Set the encoding types for the lon, lat, time fields so that the
# output files store these coordinates in double precision.
encoding = {
    "time": {"dtype": "double"},
    "lat": {"dtype": "double"},
    "lon": {"dtype": "double"},
}

# Loop over years, writing one combined (all-species) file per year
for year in range(1750, 2015):
    print(f"Processing year {year}")

    # keep_attrs=True preserves variable/global attributes through
    # the arithmetic on the time coordinate below.
    with xr.set_options(keep_attrs=True):

        # Date as of Jan 1st of this year (as a np.datetime64[D] object)
        date_Jan1 = np.datetime64(f"{year}-01-01", "D")

        # From the combined dataset, pick only data for this year.
        # The monthly timestamps fall mid-month, so Jan 16 .. Dec 16
        # brackets all twelve months of this year.
        ds2 = ds.sel(time=slice(f"{year}-01-16", f"{year}-12-16"))

        # Adjust the time units to be days since Jan 1st of this year
        # (post-processing then rewrites the unit string accordingly).
        ds2["time"] = ds2["time"].astype("datetime64[D]") - date_Jan1

        # Write data for this year to its own file
        ds2.to_netcdf(f"CMIP6_GHG_surface_VMR_{year}.nc", encoding=encoding)