#!/usr/bin/env python

'''
This Python script concatenates all of the netCDF
files for individual species in the BB4CMIP6/v2019-06/YYYY subfolders
into one netCDF file per year, named 
"BB4CMIP6/v2019-06/BB4CMIP6-Biomass_025x025_YYYY.nc".  This is more
efficient because it minimizes the number of times a netCDF file has
to be opened.

Calling sequence:
    ./concatentate_files.py
'''

# Imports
from os.path import join
import xarray as xr
from xarray.coding.variables import SerializationWarning
import numpy as np
import warnings

# Suppress harmless run-time warnings (mostly about underflow in division)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=SerializationWarning)

# Main path (edit as necessary)
maindir = '/n/scratchlfs/jacob_lab/ryantosca/data/BB4CMIP6/v2019-06'

# Loop over years
for year in range(1800, 1850):
    print('Now processing {}'.format(year))

    # Open all files in each yearly subfolder into a single Dataset.
    # Using the Dataset as a context manager closes the underlying
    # files once this year has been written, so that file handles do
    # not pile up over the course of the loop.
    infiles = join(maindir, str(year), '*.nc')
    with xr.open_mfdataset(infiles) as ds:

        # Keep all DataArray attributes
        with xr.set_options(keep_attrs=True):

            # Loop over all variables.  Take a snapshot of the names
            # first, because the variables are reassigned in the loop.
            for v in list(ds.data_vars):

                # Xarray will try to convert missing values to NaN's,
                # so we need to replace these with zeroes.  NOTE:
                # fillna() returns a new DataArray rather than editing
                # in place, so the result must be stored back into the
                # Dataset.  It also avoids pulling the whole variable
                # into memory via ".values".
                ds[v] = ds[v].fillna(0.0)

        # Write the output file to the main path (must happen while
        # the input files are still open)
        outfile = 'BB4CMIP6-Biomass_025x025_{}.nc'.format(year)
        outpath = join(maindir, outfile)
        ds.to_netcdf(outpath)