# -*- coding: utf-8 -*-
"""GriddedEpa2GC.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Z7Btq_upCyCcHWaKk-NFhS9P_Rbiddne

# Preparation of GEOS-Chem Emissions from CMAQ

    author: Barron H. Henderson
    contributors: Lyssa Freese. Adding names pending approval...
    date: 2020-06-04

# Overview

This notebook takes SMOKE outputs, which are CMAQ-ready emissions and converts them for use in GEOS-Chem. The tutorial uses "gridded reports", which contain annual data.  To make this operational, you instead repeat the process for each month.

Steps:
1. install libraries
2. Download data from EPA
3. Regrid and convert to fluxes in GEOS-Chem format

# Install Libraries

System Package | Use                   | Python Package | Use
---------------|-----------------------| ---------------|---------
libgeos-dev    | Geospatial processing | basemap        | Mapping
cdo            | Conservative regrid   | PseudoNetCDF   | IOAPI-like support


* Run the next three cells
"""

!apt-get install -qq libgeos-dev

# Commented out IPython magic to ensure Python compatibility.
# %%capture
# !apt-get install -qq cdo

!pip install -q https://github.com/matplotlib/basemap/archive/master.zip
!pip install -q https://github.com/barronh/pseudonetcdf/archive/master.zip

"""# Restart the Runtime

  * choose the "Runtime" menu, and
  * then "Restart Runtime"

# Import Libraries and Set Earth Radius

* Importing necessary libraries
* Setting IOAPI earth radius to WRF radius
"""

import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mc
import PseudoNetCDF as pnc
from calendar import monthrange
import pandas as pd
os.environ['IOAPI_ISPH'] = '6370000.'

"""# Download Files from EPA's newftp

* Downloading inline and gridded anthropogenic emissions
* Downloading biogenic and non-US sectors separately

Note: if failing, remove `-q` to see errros.
"""

areaonly_sectors = sectors = [
  'afdust_adj', 'ag', 'airports',
  'nonpt', 'nonroad', 'np_oilgas', 'onroad', 'onroad_ca_adj', 'rail', 'rwc', # domestic
]
intl_sectors = [
  'onroad_can', 'onroad_mex', 'othafdust_adj', 'othar', 'othptdust_adj', 'emln_othpt', # other countries (Can/Mex)
]

natural_sectors = [
  'ocean_cl2', 'beis',
  'emln_ptagfire', 'emln_ptfire', 'emln_ptfire_othna' # fires include some anthro
]
merged_sectors = [
  'mrggrid_nobeis_norwc', # sum of areaonly_sectors and allinln - partinln sectors
]

ptonly_sectors = ['emln_cmv_c1c2_12', 'emln_cmv_c3_12', 'emln_ptegu']

allinln_sectors = [
  'emln_pt_oilgas_allinln',
  'emln_ptnonipm_allinln',
]

partinln_sectors = [
  'emln_pt_oilgas',
  'emln_ptnonipm',
]

download_sectors = areaonly_sectors + ptonly_sectors + allinln_sectors + partinln_sectors + merged_sectors + intl_sectors + natural_sectors

# Not including natural because it would duplicate
# GEOS-Chem MEGAN, ocean, and fires
# Should cmv_c1_c2 be in? cmv_c3?
# airports?
include_sectors = areaonly_sectors + ptonly_sectors + allinln_sectors + intl_sectors

month = '07'
mon3c = pd.to_datetime('2016-{}-01'.format(month)).strftime('%b').lower()
filetmpl = "{dir}/2016fh_16j_{{sector}}_{dom}_month_{{month}}.ncf".format
smoketmpl = filetmpl(dir='input', dom='12US1').format
hemcotmpl = filetmpl(dir='output', dom='0pt1degree').format
urltmpl = filetmpl(dir="ftp://newftp.epa.gov/Air/emismod/2016/v1/gridded/monthly_netCDF/", dom='12US1').format

os.makedirs('input', exist_ok=True)
for sector in download_sectors:
  outpath = smoketmpl(sector=sector, month=month)
  url = urltmpl(sector=sector, month=month)
  print(url)
  os.system(f"wget -O {outpath}  --continue -q {url}")

os.system(f"wget --continue -q ftp://newftp.epa.gov/aqmg/global/gadm/gadm36_12US1.IOAPI.nc")

"""# Opening Files For Reading And Plotting

"""

smokepaths = {
    sector: smoketmpl(sector=sector, month=month)
    for sector in include_sectors + natural_sectors
}
smokefiles = {
    sector: pnc.pncopen(path, format='ioapi', mode='r')
    for sector, path in smokepaths.items()
}

reffile = smokefiles[include_sectors[0]]

"""# Store Grid Parameters for Later Use

* Regridding requires knowing about the grid structure
* We are pulling all the metadata, so that we can use what we need.

"""

gridproperties = reffile.getncatts()
exec('nominalarea = XCELL * YCELL', None, gridproperties)
exec('false_easting = -XORIG', None, gridproperties)
exec('false_northing = -YORIG', None, gridproperties)
exec('gridsize = NCOLS * NROWS', None, gridproperties)

"""# Creating Custom Merge files

* Disabling warnings
  * warns that  variables missing in the right hand file
  * and are simply copied from the first file.
* We don't need to see that.
"""

def combine(f1, f2):
  """
  Arguments
  ---------
  f1 : netcdf-like object
    left-hand side of combine
  f2 : netcdf-like object
    right-hand side of combine

  Returns
  -------
  outf : netcdf-like object
    has all variables from f2 added to f1. If a variable is in f2 but
    not f1, it is added
  
  Notes
  -----
  Warnings are about variables in f1 that are not in f2, so they are ignored
  """
  import warnings
  warnings.filterwarnings('ignore')

  outf = f1 + f2
  for k, v in f2.variables.items():
      if k not in outf.variables:
          outf.copyVariable(v, key=k)
  warnings.resetwarnings()
  return outf

def merge(files):
  """
  Arguments
  ---------
  files : iterable of netcdf-like files
    files to be combined

  Returns
  -------
  outf : netcdf-like object
    has all mass from all variables in all files
  """
  outf = files[0].copy()
  for othf in files[1:]:
    outf = combine(outf, othf)
    
  return outf

def merge_from_dict(source, keys, outpath):
  """
  Arguments
  ---------
  source : dictionary
    contains netcdf-like variables keyed by sector
  keys : iterable
    keys to use to get netcdf-like variables for merging
  outpath : str
    save the merged file to disk
  
  Returns
  -------
  None
  """
  if os.path.exists(outpath):
    os.remove(outpath)
  outf = merge([source[k] for k in keys])
  history = getattr(outf, 'HISTORY', '')
  history += '; ' + ' + '.join(keys)
  setattr(outf, 'HISTORY', history)
  ondisk = outf.save(outpath, verbose=0);
  ondisk.close()

outpath = smoketmpl(sector='merge_areaonly', month=month)
merge_from_dict(smokefiles, areaonly_sectors, outpath)

outpath = smoketmpl(sector='merge_point', month=month)
merge_from_dict(smokefiles, allinln_sectors + ptonly_sectors, outpath)

outpath = smoketmpl(sector='merge_intl', month=month)
merge_from_dict(smokefiles, intl_sectors, outpath)

outpath = smoketmpl(sector='merge_natural', month=month)
merge_from_dict(smokefiles, natural_sectors, outpath)

mergedfiles = {
    sector: pnc.pncopen(smoketmpl(sector=sector, month=month), format='ioapi', mode='r')
    for sector in ['merge_areaonly', 'merge_point', 'merge_intl', 'merge_natural']
}
outpath = smoketmpl(sector='merge', month=month)
merge_from_dict(mergedfiles, ['merge_areaonly', 'merge_point', 'merge_intl'], outpath)
mergedfiles['merge'] = pnc.pncopen(outpath, format='ioapi', mode='r')

"""# Create a Database of Sums

This will be used later for checking regridded data
"""

checkspc = 'NOX'
originalrate = {
  sector: smokefiles[sector].variables[checkspc][:].sum()
  for sector in include_sectors if checkspc in smokefiles[sector].variables
}
mergedrate = {
    k: v.variables[checkspc][:].sum() for k, v in mergedfiles.items()
}

originalrate

print(sum(originalrate.values()))
print(mergedrate)

"""# Comparing to US Report

* Download reference report for comparison
* This report contains US-only sectors
  * for cmv and beis, the smoke files include areas outside the US
  * The smoke files should always include more mass than the report.
  * All other sectors in the monthly report should match.
* Comparing US sectors excluding CMV, beis, and fire for total
"""

os.system("wget --continue ftp://newftp.epa.gov/Air/emismod/2016/v1/reports/2016fh_county_monthly_report_22jan2020_v0.csv")

monthlycounty = pd.read_csv('2016fh_county_monthly_report_22jan2020_v0.csv', comment='#')

monthlytotal = monthlycounty.groupby(['country_cd', 'census_tract_cd', 'poll']).sum().filter(like='_value').round(0)
spcmonthlytotal = monthlytotal.query('poll == "{}"'.format(checkspc)).copy()
for sector, rate in originalrate.items():
  sectork = sector.replace('emln_', '').replace('_allinln', '')
  key = ('US', sectork, checkspc)
  if key in spcmonthlytotal.index:
    spcmonthlytotal.loc[key, 'check'] = rate
  else:
    print('Skipping ', key, 'not in report')
spcmonthlytotal.round(0)

reportmatch = (
    mergedrate['merge']
    - mergedrate['merge_intl'] - originalrate['emln_cmv_c1c2_12'] - originalrate['emln_cmv_c3_12']
)
reporttotal = spcmonthlytotal.query('census_tract_cd not in ("beis", "ptfire", "ptagfire", "cmv_c1c2_12", "cmv_c3_12")').sum()
print('Report without fire, beis, cmv')
print(reporttotal.round(0))
print ('\nFor comparing to report', reportmatch)
print('Ratio', reportmatch / reporttotal[mon3c + '_value'])

"""# Create Spatial Plots

* Showing several aggregates on top
* Components shown on bottom
"""

key = 'NOX'
norm = mc.LogNorm(vmin=1e-1, vmax=1e3)
fig, axx = plt.subplots(2, 3, gridspec_kw=dict(hspace=.05, wspace=0.025), figsize=(10, 6))
cax = fig.add_axes([0.1, 0.05, .8, .025])
plotopt = dict(norm=norm)
cbaropt = dict(label='{} tons/year'.format(key), cax=cax, orientation='horizontal')
plt.sca(axx[0, 0])
mergedfiles['merge'].plot(key, plot_kw=plotopt, cbar_kw=cbaropt).set_xlabel('Total')
plt.sca(axx[0, 1])
mergedfiles['merge_point'].plot(key, plot_kw=plotopt, cbar_kw=cbaropt).set_xlabel('Point')
plt.sca(axx[0, 2])
mergedfiles['merge_areaonly'].plot(key, plot_kw=plotopt, cbar_kw=cbaropt).set_xlabel('Area')
plt.sca(axx[1, 0])
mergedfiles['merge_natural'].plot(key, plot_kw=plotopt, cbar_kw=cbaropt).set_xlabel('Natural')
plt.sca(axx[1, 1])
mergedfiles['merge_intl'].plot(key, plot_kw=plotopt, cbar_kw=cbaropt).set_xlabel('International')
axx[1, 2].set_visible(False)

"""# Prepare for Regridding By Defining Grids

* EPA12US1.grid defines a Lambert Conic Conformal grid
* latlon.grid defines a regional latitude/longitude grid
* z.grid defines the first layer of GEOS-Chem's hybrid coordinate

"""

lccpath = 'EPA12US1.grid'
llpath = 'latlon.grid'
with open(lccpath, mode='w') as gf:
  gf.write("""gridtype = projection
gridsize = {gridsize}
xsize = {NCOLS}
ysize = {NROWS}
xinc = {XCELL}
yinc = {YCELL}
xname = x
yname = y
grid_mapping_name = lambert_conformal_conic
latitude_of_projection_origin = {YCENT}
longitude_of_projection_origin = {XCENT}
longitude_of_central_meridian = {XCENT}
standard_parallel = {P_ALP}, {P_BET}
earth_radius = 6370000.
semi_major_axis = 6370000.
semi_minor_axis = 6370000.
false_easting = {false_easting}
false_northing = {false_northing}
""".format(**gridproperties))
  
with open(llpath, mode='w') as gf:
  gf.write("""gridtype  = lonlat
gridsize  = 360000
xsize     = 900
ysize     = 400
xname     = lon
xlongname = "longitude"
xunits    = "degrees_east"
yname     = lat
ylongname = "latitude"
yunits    = "degrees_north"
xfirst    = -139.95
xinc      = 0.1
yfirst    = 20.05
yinc      = 0.1
""")
  
with open('z1.grid', mode='w') as gf:
  gf.write("""zaxistype = hybrid 
size = 1
levels = 0
positive = "up"
long_name = "GEOS-Chem levels"
axis = "Z"
""")

"""# Initialize the CDO Library and Prepare Command

* Initialization is easy
* You may get a warning  

"""

from cdo import Cdo
cdo = Cdo()
# cdo.CDO = '/work/ROMO/anaconda3/envs/geo/bin/cdo

"""# Now Configure the CDO Command

cmd has 5 parts (run and described in bottom-to-top order)

* Set the vertical axis,
* rename TSTEP axis to time, remove TFLAG, set the time
* convert from tons/year to tons/s, tons/m2/s, then kg/m2/s
* Set units to mass fluxes
* Set grid to LCC and Regrid to the lon/lat grid

"""

cmdtmpl = (
    '-remapycon,{llpath} -setgrid,{lccpath} ' + # assign LCC grid and regrid
    '-setattribute,NOX@units=kgNO2/m2/s -setattribute,NO@units=kgNO2/m2/s -setattribute,*@units=kg/m2/s ' + # fix units
    '-mulc,{unitconversion} -divc,{nominalarea} -divc,{seconds} ' + # convert tons/year to kg/m2/s
    '-setreftime,1970-01-01,00:00:00,hours -settaxis,2016-{month}-01,00:00:00 -delname,TFLAG -chname,TSTEP,time ' + # fix time properties
    '-setzaxis,z1.grid -chname,LAY,lev ' +
    '{inpath}' # fix dimensions
)

"""# Regrid Each file

* Using cmd described above and applying 0 to missing values
* Output
  * NetCDF4
  * grid chunking
  * level 1 compressions

"""

remakeall = False
fileopts = '-O -f nc4 -k grid -z zip_1'
os.makedirs('output', exist_ok=True)
options = dict(
  llpath=llpath, lccpath=lccpath, nominalarea=gridproperties['nominalarea'],
  unitconversion=907.185, month=month
)

cdo_sectors = include_sectors + list(mergedfiles)

for sector in cdo_sectors:
  inpath = smoketmpl(sector=sector, month=month)
  outpath = hemcotmpl(sector=sector, month=month)
  if os.path.exists(outpath) and not remakeall:
    print('Keeping cached:', outpath)
    continue
  else:
    print(outpath, end='', flush=True)
  f = pnc.pncopen(inpath, format='ioapi')
  sdate = f.getTimes()[0]
  ndays = monthrange(sdate.year, sdate.month)[1]
  options['seconds'] = ndays * 24 * 3600
  if os.path.exists(outpath):
    os.remove(outpath)
  cmd = cmdtmpl.format(inpath=inpath, **options)
  print(cmd,end='.')
  cdo.setmisstoc(0, input=cmd, output=outpath, options=fileopts)
  print('.done', flush=True)
print('Success: all complete')

"""# Check Mass Conservation

* Calculate the Mass from Regridded Fluxes
* Get the Area of the New Grid
* Convert to mass comparable to CMAQ
  * Multiply the area by the fluxes
  * Convert per second rates to per year
  * Convert kg to short tons


"""

areaf = cdo.gridarea(llpath, input='-remapnn,%s -stdatm,0' % llpath, options='-f nc4', returnCdf=True)
area = areaf.variables['cell_area'][:]

hemco2dfiles = {
  sector: pnc.pncopen(hemcotmpl(sector=sector, month=month), format='netcdf', mode='r')
  for sector in include_sectors
}

hemco2dmergedfiles = {
  sector: pnc.pncopen(hemcotmpl(sector=sector, month=month), format='netcdf', mode='r')
  for sector in list(mergedfiles)
}
hemco2drate = {}
for sector in include_sectors:
  sfile = hemco2dfiles[sector]
  if checkspc not in sfile.variables:
    continue
  sdate =  sfile.getTimes()[0]
  ndays = monthrange(sdate.year, sdate.month)[1]
  checkval = (sfile.variables[checkspc][:].sum((0, 1)) * area).sum() * ndays * 24 * 3600 * 0.00110231
  hemco2drate[sector] = checkval

hemco2dmergedrate = {}
for sector in list(mergedfiles):
  sfile = hemco2dmergedfiles[sector]
  if checkspc not in sfile.variables:
    continue
  sdate =  sfile.getTimes()[0]
  ndays = monthrange(sdate.year, sdate.month)[1]
  checkval = (sfile.variables[checkspc][:].sum((0, 1)) * area).sum() * ndays * 24 * 3600 * 0.00110231
  hemco2dmergedrate[sector] = checkval

hemco2dtotal = sum(hemco2drate.values())
hemco2dmergedtotal = (hemco2dmergedrate['merge_areaonly'] + hemco2dmergedrate['merge_point'] + hemco2dmergedrate['merge_intl'])
hemco2dreportmatch = (
    hemco2dmergedrate['merge']
    - hemco2dmergedrate['merge_intl'] - hemco2drate['emln_cmv_c1c2_12'] - hemco2drate['emln_cmv_c3_12']
)
print('Report without fire, beis, cmv')
print(reporttotal.round(0))
print ('\nFor comparing to report', hemco2dreportmatch)
print('Ratio', hemco2dreportmatch / reporttotal[mon3c + '_value'])

hemco2dtotal = sum(hemco2drate.values())
hemco2dmergedtotal = (hemco2dmergedrate['merge_areaonly'] + hemco2dmergedrate['merge_point'] + hemco2dmergedrate['merge_intl'])
print('ORIG', mergedrate['merge'])
print('HEMCO2D merged ar/pt/intl', hemco2dtotal)
print('HEMCO2D single merged', hemco2dmergedtotal)
print('ORIG/HEMCO2D', hemco2dtotal / mergedrate['merge'])
hemco2dmergedrate

for key in 'NOX APIN BPIN ISOP SESQ TERP'.split():
  for sector in include_sectors:
    sfile = hemco2dfiles[sector]
    if key not in sfile.variables:
        continue
    vals = sfile.variables[key][:]
    print('{}: {} sum={:.7g}'.format(sector, key, (vals * area).sum() * 3600 * 24 * 366 * 0.00110231))

"""# Plot Results

* Always good to take a look at NOX
* Showing 2D results
* cmv c1, c2, c3 has international emissions, which explains the small emission masses in Canada.

"""

spc = 'NOX'
fig, axx = plt.subplots(2, 2, gridspec_kw=dict(bottom=0.15))
cax = fig.add_axes([.1, .05, .8, .025])
plotopts=dict(norm=mc.LogNorm())
cbaropts = dict(cax=cax, orientation='horizontal')
plt.sca(axx[0, 0])
hemco2dmergedfiles['merge'].plot(spc, plottype='lon-lat', plot_kw=plotopts, cbar_kw=cbaropts)
axx[0, 0].set_ylabel("total")
plt.sca(axx[1, 0])
hemco2dmergedfiles['merge_areaonly'].plot(spc, plottype='lon-lat', plot_kw=plotopts, cbar_kw=cbaropts)
axx[1, 0].set_ylabel('Gridded')
plt.sca(axx[1, 1])
hemco2dmergedfiles['merge_point'].plot(spc, plottype='lon-lat', plot_kw=plotopts, cbar_kw=cbaropts)
axx[1, 1].set_ylabel('emln')
axx[1, 1].yaxis.set_label_position("right")
plt.sca(axx[0, 1])
hemco2dmergedfiles['merge_intl'].plot(spc, plottype='lon-lat', plot_kw=plotopts, cbar_kw=cbaropts)
axx[0, 1].set_ylabel('intl')
axx[0, 1].yaxis.set_label_position("right")
from mpl_toolkits.basemap import Basemap
bmap = Basemap(llcrnrlat=20, llcrnrlon=-137.5, urcrnrlat=60, urcrnrlon=-50, resolution='c')
for ax in axx.ravel():
  bmap.drawcoastlines(ax=ax)
  bmap.drawcountries(ax=ax)

"""# Vertical Allocation Table

* Oversimplifying vertical allocation
  * Similar to Simpson et al. 2003 Table 4.1, we apply a single representative allocation to each sector.
  * Simpson D., Fagerli, H., Jonson, J. E., Tsyro, S., Wind, P., and Tuovinen, J.: "Transboundary Acidification, Eutrophication, and Ground Level Ozone in Europe - Part I: Unified EMEP Model Description”, EMEP Status Report 2003, The  Norwegian Meteorological Institute, Oslo, 25 Norway, 2003. https://www.emep.int/publ/reports/2003/emep_report_1_part1_2003.pdf
"""

import pandas as pd
import io

vertical_allocation = pd.read_csv(io.StringIO("""L,bottom_m,top_m,othpt,ptegu,ptnonipm,pt_oilgas,ptfire,cmv_c3
1,-6,123,0.806067059,0.027331187,0.751204553,0.918700528,0.11976381,0.537538889
2,123,254,0.087955253,0.12260809,0.144224191,0.059678397,0.098297847,0.338982929
3,254,387,0.053252808,0.192017292,0.052822078,0.013323237,0.08277491,0.123478182
4,387,521,0.025605531,0.212736741,0.023840537,0.003978944,0.074659995,0
5,521,657,0.011408677,0.167549199,0.011000364,0.001724366,0.077561337,0
6,657,795,0.005787852,0.110491495,0.005661267,0.001038275,0.073349589,0
7,795,934,0.003758579,0.063653198,0.003165207,0.000633395,0.066694526,0
8,934,1075,0.002673887,0.036979211,0.00197344,0.000387233,0.061845442,0
9,1075,1218,0.001675656,0.022844579,0.001300664,0.000243355,0.058324971,0
10,1218,1363,0.000670159,0.013909341,0.000861865,0.000122994,0.053902287,0
11,1363,1510,0.000357644,0.008781226,0.000621856,6.48583E-05,0.044774262,0
12,1510,1659,0.00022693,0.006647819,0.00052355,4.04985E-05,0.041122289,0
13,1659,1860,0.000159139,0.004305749,0.000466642,2.12041E-05,0.0306914,0
14,1860,2118,0.000152154,0.003380562,0.000680222,1.75333E-05,0.031529605,0
15,2118,2382,0.000113674,0.001827346,0.000612964,1.03278E-05,0.02551535,0
16,2382,2654,8.79442E-05,0.000957313,0.000260885,5.76406E-06,0.020748307,0
17,2654,2932,3.14778E-05,0.000523901,0.000311535,3.23215E-06,0.014366484,0
18,2932,3219,1.29957E-05,0.000381747,0.000231719,2.27428E-06,0.010906613,0
19,3219,3665,1.37334E-06,0.000409744,0.000115745,1.78044E-06,0.009224629,0
20,3665,4132,3.15633E-07,0.000340645,4.19637E-05,6.00909E-07,0.003409241,0
21,4132,4623,3.71912E-07,0.00026689,1.84353E-05,4.14262E-07,0.00014894,0
22,4623,5142,4.01952E-07,0.000231509,1.03385E-05,3.73296E-07,0.000125351,0
23,5142,5692,1.17457E-07,0.000225471,9.11318E-06,3.01765E-07,8.76892E-05,0
24,5692,6277,0,0.000310076,9.82452E-06,1.13299E-07,3.24343E-05,0
25,6277,6905,0,0.000327088,1.07925E-05,0,3.0049E-05,0
26,6905,7582,0,0.000260516,1.2311E-05,0,2.85539E-05,0
27,7582,8320,0,0.000130502,4.58591E-06,0,2.08973E-05,0
28,8320,9409,0,0.000119402,2.72213E-06,0,3.26732E-05,0
29,9409,10504,0,0.000106184,6.30588E-07,0,1.74394E-05,0
30,10504,11578,0,9.07766E-05,0,0,1.30783E-05,0
31,11578,12633,0,0.000117164,0,0,0,0
32,12633,13674,0,0.000122616,0,0,0,0
33,13674,14706,0,1.54213E-05,0,0,0,0
34,14706,15731,0,0,0,0,0,0
35,15731,16753,0,0,0,0,0,0
36,16753,17773,0,0,0,0,0,0
37,17773,19855,0,0,0,0,0,0
38,19855,22004,0,0,0,0,0,0
39,22004,24240,0,0,0,0,0,0
40,24240,26596,0,0,0,0,0,0
41,26596,31716,0,0,0,0,0,0
42,31716,37574,0,0,0,0,0,0
43,37574,44286,0,0,0,0,0,0
44,44286,51788,0,0,0,0,0,0
45,51788,59924,0,0,0,0,0,0
46,59924,68392,0,0,0,0,0,0
47,68392,80581,0,0,0,0,0,0
""")).set_index(['L'])

"""# Set a Maximum Number of Layers

* Some layers have non-zero values up to layer 33
  * This seems unreasonable.
  * Simpson et al. 2003 had up to layer 11 in this vertical coordinate
* Setting a maximum top layer of 11
  * renormalizing mass within those 11 layers.
"""

nl = 11
vertical_allocation_layers = vertical_allocation.loc[1:nl]
vertical_allocation_layer_fraction = vertical_allocation_layers / vertical_allocation_layers.sum()

"""# Apply Vertical Allocation

* Map each smoke sector to a vertical allocation
* Manually fixing meta data
  * (hyai, hybi, and lev)
  * renaming dimensions to ilev
"""

import gc


sector2verticalallocation = {
  'emln_cmv_c1c2_12': 'cmv_c3',
  'emln_cmv_c3_12': 'cmv_c3',
  'emln_othpt': 'othpt',
  'emln_pt_oilgas_allinln': 'pt_oilgas',
  'emln_ptegu': 'ptegu',
  'emln_ptnonipm_allinln': 'ptnonipm'
}

for sector in ptonly_sectors + allinln_sectors:
  gc.collect()
  inpath = hemcotmpl(sector=sector, month=month)
  outpath = inpath.replace('0pt1degree', '0pt1degree_3D')
  origlayerfractions = vertical_allocation_layer_fraction[sector2verticalallocation[sector]].values
  nl = (origlayerfractions == 0).argmax()
  if nl == 0:
    nl = len(origlayerfractions)
  layerfractions = origlayerfractions[:nl]
  layerfractions /= layerfractions.sum()
  print(sector, origlayerfractions)
  print(sector, nl, layerfractions)
  print('Start.', outpath, end='')
  if os.path.exists(outpath):
    os.remove(outpath)
  emln2df = pnc.pncopen(inpath, format='netcdf')
  emln3df = emln2df.sliceDimensions(lev=[0] * nl).sliceDimensions(lev=[0] * nl)#.sliceDimensions(nhyi=[0] * (nl + 1))
  for key, var in emln3df.variables.items():
    if 'lev' in var.dimensions:
      newshape = tuple([(slice(None) if dk == 'lev' else None) for dk in var.dimensions])
      var[:] = var[:] * layerfractions[newshape]
  # hyai = emln3df.variables['hyai'][:] = [
  #   0.000000e+00, 4.804826e-02, 6.593752e+00, 1.313480e+01, 1.961311e+01, 2.609201e+01,
  #   3.257081e+01, 3.898201e+01, 4.533901e+01, 5.169611e+01, 5.805321e+01, 6.436264e+01
  # ][:nl+1]
  # hybi = emln3df.variables['hybi'][:] = [
  #   1.000000e+00, 9.849520e-01, 9.634060e-01, 9.418650e-01, 9.203870e-01, 8.989080e-01,
  #   8.774290e-01, 8.560180e-01, 8.346609e-01, 8.133039e-01, 7.919469e-01, 7.706375e-01
  # ][:nl+1]
  # emln3df.variables['hybm'][:] = np.convolve([.5, .5], hybi, mode='valid')
  # emln3df.variables['hyam'][:] = np.convolve([.5, .5], hyai, mode='valid')
  lev = emln3df.createVariable('lev', 'd', ('lev',))
  lev.long_name = "GEOS-Chem levels" ;
  lev.standard_name = "GEOS-Chem levels" ;
  lev.units = "model level" ;
  lev.positive = "up" ;
  lev[:] = np.arange(nl, dtype='i')
  # Requires special treatment in variable definition
  # to match geos-chem optimized chunking.
  outnc = emln3df.copy(variables=False).save(outpath, verbose=0)
  dimlens = {k: len(v) for k, v in emln3df.dimensions.items()}
  chunksizes = dimlens.copy()
  chunksizes['lev'] = 1
  chunksizes['time'] = 1
  for vark in emln3df.dimensions:
    if vark not in emln3df.variables:
        continue
    var = emln3df.variables[vark]
    opts = dict(complevel=0, chunksizes=tuple([dimlens[dk] for dk in var.dimensions]))
    ovar = outnc.createVariable(vark, var.dtype.char, var.dimensions, **opts)
    ovar.setncatts(var.getncatts())
    ovar[:] = var[:]
    
  for vark, var in emln3df.variables.items():
    if vark in emln3df.dimensions:
        continue
    opts = dict(complevel=1, chunksizes=tuple([chunksizes[dk] for dk in var.dimensions]))
    ovar = outnc.createVariable(vark, var.dtype.char, var.dimensions, **opts)
    ovar.setncatts(var.getncatts())
    ovar[:] = var[:]
    
  outnc.close()
  print('.Complete')

"""# Now Do a Mass Conservation Check

* Open the 3d file
* Open the reference file
* Sum the values and compare
"""

from glob import glob

emln3dpaths = sorted(glob('output/*_3D_*.ncf'))
for inpath in emln3dpaths:
  refpath = inpath.replace('_3D', '')
  f3d = pnc.pncopen(inpath)
  fref = pnc.pncopen(refpath)
  print(refpath)
  print('emln   {:2d} lev NOX {:.7e} sum kgNO2/m2/s'.format(len(fref.dimensions['lev']), fref.variables['NOX'][:].sum()))
  print('emln3d {:2d} lev NOX {:.7e} sum kgNO2/m2/s'.format(len(f3d.dimensions['lev']), f3d.variables['NOX'][:].sum()))
  print('Within numerical precision', np.allclose(fref.variables['NOX'][:, 0], f3d.variables['NOX'][:].sum(1)))

"""# Configure Translation from CMAQ to GEOS-Chem

* Translation is not always 1:1
* Apply scaling factors
  * Diurnal scale factors (25, 26)
  * Day of week (210-222)
  * Split OC/EC (70-73)
  * Convert NO kgNO2/m2/s to kgNO/m2/s (115)
"""

cq2gc = {
    'ACET': [['ACET', '26/213/1007']],
    'ALD2': [['ALD2', '26/213/1007']],
    'ALDX': [['RCHO', '26/213/1007']],
    'ACROLEIN': [['MACR', '26/213/1007']],
    'BENZ': [['BENZ', '26/213/1007']],
    'ECH4': [['CH4', '1007']],
    'CL': [['Cl', '1007']],
    'CL2': [['Cl2', '1007']],
    'CLO': [['ClO', '1007']],
    'CLNO2': [['ClNO2', '1007']],
    'CLNO3': [['ClNO3', '1007']],
    'CO': [['CO', '26/211/1007']],
    'ETOH': [['EOH', '26/213/1007']],
    'ETHA': [['C2H6', '26/217/1007']],
    'ETH': [['C2H4', '26/213/1007']],
    'FACD': [['HCOOH', '26/213/1007']],
    'FORM': [['CH2O', '26/213/1007']],
    'GLY': [['GLYX', '26/214/1007']], # Following MEK
    'GLYD': [['GLYC', '26/214/1007']], # Following MEK
    'HCL': [['HCl', '1007']],
    'HOCL': [['HOCl', '1007']],
    'HONO': [['HNO2', '25/210/1007']],
    'HPLD': [['HPALD', '26/214/1007']], # Following MEK
    'IOLE': [['PRPE', '26/215/1007']],
    'KET': [['MEK', '26/214/1007']],
    'MEPX': [['MP', '26/213/1007']], # Following MEOH
    'MEOH': [['MOH', '26/213/1007']],
    'NH3': [['NH3', '26/213/1007']],
    'NO': [['NO', '115/25/210/1007']],
    'NO2': [['NO2', '25/210/1007']],
    'OLE': [['PRPE', '26/215/1007']],
    'PACD': [['MAP', '26/213/1007']], # Following MEOH
    'PAR': [['ALK4', '26/212/1007']],
    'PNA': [['HNO4', '26/213/1007']], # Following MEOH
    'PRPA': [['C3H8', '26/216/1007']],
    'SO2': [['SO2', '26/218/1007']],
    'SULF': [['SO4', '26/218/1007']],
    'TOL': [['TOLU', '26/213/1007']],
    'PEC': [['BCPI', '26/221/1007/70'], ['BCPO', '26/221/256/1007/71']],
    'PFE': [['pFe', '26/219/1007']],
    'POC': [['OCPI', '26/222/1007/72'], ['OCPO', '26/222/257/1007/73']],
    'PNH4': [['NH4', '26/218/1007']], # need to confirm this
    'PNO3': [['NIT', '26/218/1007']], # need to confirm this
    'PSO4': [['SO4', '26/219/1007']],
    'XYLMN': [['XYLE', '26/213/1007']],
}
# Ignore special species
exec('ALD2_PRIMARY = FORM_PRIMARY = NH3_FERT = []', None, cq2gc)
# ignore inventory meta variables
exec('HFLUX = VOC_INV = VOC_BEIS = CO2_INV = N2O_INV = CH4_INV = []', None, cq2gc)
# Ignore duplicative species
# TOLU in TOL
# NOX = NO + NO2 + HONO
# CH4 in ECH4
exec('CH4 = TOLU = NOX = []', None, cq2gc)
# Ignore coordinate variables
exec('hybm = hyam = []', None, cq2gc)
# Ignore APIN, BPIN, and SESQ - they were from BEIS
# and were removed
exec('APIN = BPIN = SESQ = []', None, cq2gc)
# ISOP and TERP are mostly biogenic, so igoring remainder
exec('ISOP = TERP = []', None, cq2gc)
# Ignore species GC doesn't have
exec('BUTADIENE13 = ETHY = NAPH = []', None, cq2gc)
# unreactive and nonvolatile are not used
exec('UNK = UNR = NVOL = NR = []', None, cq2gc)
# Ignore unused particulate species
exec('SOAALK = PAL = PCA = PCL = PFE = PH2O = PK = PSI = PTI = []', None, cq2gc)
exec('PM2_5 = PMC = PMG = PMN = PMOTHR = PNCOM = P = PCA = PCL = PFE = PH2O = PK = []', None, cq2gc)

"""# Define HEMCO_Config.rc Writer

* Create a HEMCO_Config.rc that reads all included files
* Only needed to be made once and editted to use $MM instead of 01..12

"""

def changepathtopattern(path):
  return '$ROOT/NEI2016fh/' + path.replace('month_' + month, 'month_$MM')

def getsector(hcpath):
    sector = hcpath.split('16j_')[1].split('_0pt')[0]
    sector = sector.replace('emln_', '').replace('_allinln', '').replace('_12', '').replace('_adj', '').replace('cmv_', '')
    sector = sector.replace('_oilgas', 'og')
    return sector

def writeconfig(sectors, outpath):
    hcpaths = []
    for sector in sectors:
      hemco2dpath = hemcotmpl(sector=sector, month=month)
      hemco3dpath = hemco2dpath.replace('0pt1degree', '0pt1degree_3D')
      if os.path.exists(hemco3dpath):
          hcpaths.append(hemco3dpath)
      elif os.path.exists(hemco2dpath):
          hcpaths.append(hemco2dpath)
      else:
          raise KeyError('Could not find regridded: ' + hemco2dpath)

    defaults = set()
    ignores = set()
    with open(outpath, 'w') as hcf:
        hcf.write('(((EPA2016_MONMEAN\n')
        for hcpath in hcpaths:
            hcpatt = changepathtopattern(hcpath)
            sector = getsector(hcpath)
            print(sector, hcpatt, end='', flush=True)
            hcfile = pnc.pncopen(hcpath, format='netcdf')
            for cqkey, v in hcfile.variables.items():
                if cqkey in hcfile.dimensions or cqkey in ('hyai', 'hybi'):
                  continue
                elif cqkey in ('TOLU',):
                  warn('TOLU mass is duplicated by TOL')
                if cqkey in cq2gc:
                  gctrans = cq2gc.get(cqkey)
                  if len(gctrans) == 0:
                    ignores.add(cqkey)
                else:
                  defaults.add(cqkey)
                  gctrans = [[cqkey, '1007']]
                for gckey, scale in gctrans:
                  if gckey in ['ACET', 'MEK', 'ALD2', 'PRPE', 'PRPA', 'BENZ', 'TOLU', 'XYLE', 'EOH', 'ALK4', 'ISOP']:
                    units = 'kgC/m2/s'
                  else:
                    units = v.units.strip()
                  opts = dict(
                    unit=units,
                    gckey=gckey,
                    cqkey=cqkey,
                    sector=sector,
                    path=hcpatt,
                    scale=scale,
                    cat='1/2',
                    hier=50
                  )
                  hcf.write('0 EPA16_{gckey}__{sector}{cqkey} {path}  {cqkey}       2016-2016/1-12/1/0 C xyz  {unit}  {gckey}   {scale}     {cat} {hier}\n'.format(**opts))
                  # If I use - to repeat the file, the mass is from the previous cqkey too.
                  # hcpatt = '-'
            print()
        hcf.write(')))EPA2016_MONMEAN\n')
    print('Ignored', sorted(ignores))
    print('Defaults', sorted(defaults))

"""# Choose paths to use in HEMCO

* hemco3d files take precedence
* hemco2d are the default
* Must have one or the other
"""

writeconfig(include_sectors, 'output/EPA_Config.rc')
writeconfig(['merge_areaonly', 'merge_intl'] + ptonly_sectors + allinln_sectors, 'output/EPAMERGED_Config.rc')

"""# Review the HEMCO file"""

less output/EPA_Config.rc

"""# Summary

You've created 3 important in the output folder.
1. A 3D emln file with no fires.
2. A 2D gridded file with no beis
3. A HEMCO_Config.rc file

The are two additional files that are just references:
1. tot_nobeis_nofire is a refernce fo the expected total
2. emln_nofire (non-3D)

Download the output directory
"""

os.system('zip output.zip output/*')

ls -lh output.zip