#!/usr/bin/env python3
"""
Description:
------------
This Python script (assumes Python3) reads a GEOS-Chem or
HEMCO-standalone log file containing dry-run output and does
the following:

(1) Creates a list of unique files that are required for the
    GEOS-Chem or HEMCO-standalone simulation;

(2) Creates a bash script to download missing files from either
    the ComputeCanada server (default) or the AWS s3://gcgrid bucket;

(3) Executes the bash script to download the necessary data;

(4) Removes the bash script upon successful download.

Remarks:
--------
(1) This script only requires the "os", "sys", and "subprocess"
    packages, which are part of core Python.  Therefore, this script
    can be shipped with GEOS-Chem run directories.  It only requires
    Python 3 and not a full Anaconda/Miniconda environment (but you
    can run it in an Anaconda environment if you have one).

(2) Jiawei Zhuang found that it is much faster to issue aws s3 cp
    commands from a bash script than from a Python script.  Therefore,
    in this routine we create a bash script with all of the download
    commands, which will be executed by the main routine.
"""

# Imports
import os
import sys
import subprocess

# Exit with error if we are not using Python3
assert sys.version_info.major >= 3, \
    "ERROR: Python 3 is required to run download_data.py!"

# Define global variables
INPUT_GEOS_FILE = "./input.geos"
DATA_DOWNLOAD_SCRIPT = "./auto_generated_download_script.sh"


def extract_pathnames_from_log(dryrun_log):
    """
    Returns a list of pathnames from a GEOS-Chem log file.

    Args:
    -----
    dryrun_log : str
        GEOS-Chem or HEMCO-standalone log file with dry-run output.

    Returns:
    --------
    paths : dict
        paths["comments"]: Dry-run comment lines.
        paths["found"]: List of file paths found on disk.
        paths["missing"]: List of file paths that are missing.
        paths["local_prefix"]: Local data directory root.

    Author:
    -------
    Jiawei Zhuang (jiaweizhuang@g.harvard.edu)
    Modified by Bob Yantosca (yantosca@seas.harvard.edu)
    """

    # Initialization
    comments = ["!"*79,
                "!!! LIST OF (UNIQUE) FILES REQUIRED FOR THE SIMULATION"]
    data_found = set()
    data_missing = set()

    # Open the file (or die with error)
    try:
        f = open(dryrun_log, "r")
    except FileNotFoundError:
        raise FileNotFoundError("Could not find file {}".format(dryrun_log))

    # Read the log file line by line and add file paths to
    # the data_found and data_missing sets.
    line = f.readline()
    while line:

        # Convert line to uppercase for string matching
        upcaseline = line.upper()

        # Search for data paths that have been found
        if (": OPENING" in upcaseline) or (": READING" in upcaseline):
            data_found.add(line.split()[-1])

        # Search for data paths that are missing
        elif "FILE NOT FOUND" in upcaseline:
            data_missing.add(line.split()[-1])

        # Search for certain dry-run comment strings
        # (and make sure to prevent duplicates)
        elif ("!!! STA" in upcaseline) or ("!!! END" in upcaseline) or \
             ("!!! SIM" in upcaseline) or ("!!! MET" in upcaseline) or \
             ("!!! GRI" in upcaseline):
            if line.rstrip() not in comments:
                comments.append(line.rstrip())

        # Read the next line
        line = f.readline()

    # Close the file now that we have read all of the lines
    f.close()

    # Add another line to the comment list
    comments.append("!"*79)

    # Convert the sets (which guarantee unique values) to lists
    # and sort them in alphabetical order
    found = sorted(list(data_found))
    missing = sorted(list(data_missing))

    # Find the local data directory prefix (the path to ExtData)
    local_prefix = ""
    for path in found + missing:
        if "ExtData" in path:
            index = path.find("ExtData")
            local_prefix = path[:index]
            break

    # Exit if the local path does not contain ExtData
    if len(local_prefix) == 0:
        msg = "Could not locate the ExtData folder in your local disk space!"
        raise ValueError(msg)

    # Return a dict with the extracted information
    return {"comments": comments,
            "found": found,
            "missing": missing,
            "local_prefix": local_prefix}

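# For reference, here are illustrative dry-run log lines (the prefixes and
# paths are hypothetical) showing the patterns that
# extract_pathnames_from_log matches, case-insensitively:
#
#   HEMCO: Opening /path/to/ExtData/HEMCO/SOME/DIR/file.nc
#   GEOS-Chem: Reading /path/to/ExtData/GEOS_4x5/SOME/DIR/file.nc
#   File not found: /path/to/ExtData/HEMCO/OTHER/DIR/missing_file.nc
#
# and a minimal sketch (hypothetical values) of the dict it returns:
#
#   {"comments":     ["!!!...", ...],
#    "found":        ["/path/to/ExtData/HEMCO/SOME/DIR/file.nc", ...],
#    "missing":      ["/path/to/ExtData/HEMCO/OTHER/DIR/missing_file.nc", ...],
#    "local_prefix": "/path/to/"}
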
def get_run_info():
    """
    Searches through the input.geos file for GEOS-Chem run parameters.

    Returns:
    --------
    run_info : dict of str
        Contains the GEOS-Chem run parameters: start_date, start_time,
        end_date, end_time, met, grid, sim, and nest.
    """
    run_info = {}
    run_info["nest"] = ""
    try:
        with open(INPUT_GEOS_FILE, "r") as f:
            for line in f:
                if "Start YYYYMMDD" in line:
                    substr = line.split(":")[1]
                    run_info["start_date"] = (substr.split(" ")[1]).strip()
                    run_info["start_time"] = (substr.split(" ")[2]).strip()
                elif "End YYYYMMDD" in line:
                    substr = line.split(":")[1]
                    run_info["end_date"] = (substr.split(" ")[1]).strip()
                    run_info["end_time"] = (substr.split(" ")[2]).strip()
                elif "Met field" in line:
                    run_info["met"] = (line.split(":")[1]).strip()
                elif "Simulation name" in line:
                    run_info["sim"] = (line.split(":")[1]).strip()
                elif "Grid resolution" in line:
                    grid = (line.split(":")[1]).strip()

                    # Adjust the grid string to match file names
                    if "4.0x5.0" in grid:
                        run_info["grid"] = "4x5"
                    elif "2.0x2.5" in grid:
                        run_info["grid"] = "2x25"
                    elif "0.5x0.625" in grid:
                        run_info["grid"] = "05x0625"
                    elif "0.25x0.3125" in grid:
                        run_info["grid"] = "025x03125"
                elif "Longitude" in line:
                    if "-130.0" in line or "-140.0" in line:
                        run_info["nest"] = "na"
                    elif "60.0" in line or "70.0" in line:
                        run_info["nest"] = "as"

                    # The longitude range is the last setting we need,
                    # so stop reading the file here
                    break
    except FileNotFoundError:
        raise FileNotFoundError("Could not open {}".format(INPUT_GEOS_FILE))

    return run_info

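# For reference, illustrative input.geos lines (with hypothetical values)
# in the "label : value(s)" format that get_run_info expects:
#
#   Start YYYYMMDD, hhmmss : 20190701 000000
#   End YYYYMMDD, hhmmss   : 20190801 000000
#   Met field              : GEOSFP
#   Simulation name        : standard
#   Grid resolution        : 4.0x5.0
#
# which would yield, e.g., run_info["start_date"] = "20190701" and
# run_info["grid"] = "4x5".
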
def expand_restart_file_names(paths, run_info):
    """
    Tests if the GEOS-Chem restart file is a symbolic link to ExtData.
    If so, appends the link to the remote file to the line in which
    the restart file name is found.

    Args:
    -----
    paths : dict
        Output of function extract_pathnames_from_log.
    run_info : dict
        Output of function get_run_info.

    Returns:
    --------
    paths : dict
        The input dict, with restart file entries expanded.
    """

    # Get the prefix to the restart file folder in ExtData
    prefix = ""
    for path in paths["found"] + paths["missing"]:
        if "ExtData" in path:
            index = path.find("ExtData") + 8
            prefix = path[0:index] + "GEOSCHEM_RESTARTS/v2018-11/"
            break

    # Suffix string (takes nested grids into account)
    if run_info["nest"] == "":
        suffix = "{}.nc".format(run_info["sim"])
    else:
        suffix = "{}_{}.nc".format(run_info["sim"], run_info["nest"])

    # Search for the restart file name in the found files
    new_list = []
    for path in paths["found"]:
        if "GEOSChem.Restart" in path:
            realpath = prefix + "initial_GEOSChem_rst." + \
                run_info["grid"] + "_" + suffix

            # --------------------------------------------------------
            # KLUDGE to replace the geosfp "as" file name with "ch",
            # since symbolic links do not work on AWS s3://gcgrid
            realpath = realpath.replace("025x03125_tropchem_as.nc",
                                        "025x03125_tropchem_ch.nc")
            # --------------------------------------------------------

            path = path + " --> " + realpath
        new_list.append(path)
    paths["found"] = sorted(new_list)

    # Search for the restart file name in the missing files
    new_list = []
    for path in paths["missing"]:
        if "GEOSChem.Restart" in path:
            realpath = prefix + "initial_GEOSChem_rst." + \
                run_info["grid"] + "_" + suffix

            # --------------------------------------------------------
            # KLUDGE to replace the geosfp "as" file name with "ch",
            # since symbolic links do not work on AWS s3://gcgrid
            realpath = realpath.replace("025x03125_tropchem_as.nc",
                                        "025x03125_tropchem_ch.nc")
            # --------------------------------------------------------

            path = path + " --> " + realpath
        new_list.append(path)
    paths["missing"] = sorted(new_list)

    # Return the updated data paths
    return paths

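# For reference, expand_restart_file_names turns a restart file entry into
# a "local --> remote" pair; e.g. (hypothetical paths, for a 4x5 standard
# simulation with no nested grid):
#
#   ./GEOSChem.Restart.20190701_0000z.nc4 --> /path/to/ExtData/GEOSCHEM_RESTARTS/v2018-11/initial_GEOSChem_rst.4x5_standard.nc
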
def write_unique_paths(paths, unique_log):
    """
    Writes the unique data paths from dry-run output to a file.

    Args:
    -----
    paths : dict
        Output of function extract_pathnames_from_log.
    unique_log : str
        Log file that will hold the unique data paths.
    """
    combined_paths = paths["found"] + paths["missing"]
    combined_paths.sort()

    try:
        with open(unique_log, "w") as f:
            for comment in paths["comments"]:
                print(comment, file=f)
            for path in combined_paths:
                print(path, file=f)
            for comment in paths["comments"]:
                print(comment, file=f)
        print("Log with unique file paths written to: {}".format(unique_log))
    except FileNotFoundError:
        raise FileNotFoundError("Could not write {}".format(unique_log))


def create_download_script(paths, from_aws=False):
    """
    Creates a data download script to obtain missing files from the
    ComputeCanada data archive (default) or from the GEOS-Chem
    s3://gcgrid bucket on the AWS cloud.

    Args:
    -----
    paths : dict
        Output of function extract_pathnames_from_log.
    from_aws : bool
        If True, download from AWS s3://gcgrid.
        If False, download from ComputeCanada (default).
    """

    # Define the variables used to create the data download commands
    # for either ComputeCanada or AWS
    if from_aws:
        cmd_prefix = "aws s3 cp --request-payer=requester "
        remote_root = "s3://gcgrid"
        quote = ""
    else:
        cmd_prefix = 'wget -r -np -nH -R "*.html" -N -P ' + \
            paths["local_prefix"] + " "
        remote_root = "http://geoschemdata.computecanada.ca/ExtData"
        quote = '"'

    # Create the data download script
    with open(DATA_DOWNLOAD_SCRIPT, "w") as f:

        # Write the shebang line to the script
        print("#!/bin/bash\n", file=f)
        print("# This script was generated by download_data.py\n", file=f)

        # Write download commands for only the missing data files
        for path in paths["missing"]:

            if "-->" in path:

                # ------------------------------------------------------
                # Edge case: Linked restart files
                # ------------------------------------------------------

                # First copy the restart file to the local ExtData
                remote_rst = (path.split("-->")[1]).strip()
                local_rst = (path.split("-->")[0]).strip()
                index1 = remote_rst.find("initial")
                index2 = remote_rst.find("ExtData") + 7
                prefix = remote_rst[0:index1]
                remote_rst = remote_root + remote_rst[index2:]
                cmd = cmd_prefix + quote + remote_rst + quote
                if from_aws:
                    cmd += " " + prefix
                print(cmd, file=f)
                print(file=f)

                # Remove the prior link for safety's sake
                cmd = "if [[ -L " + local_rst + " ]]; then unlink " + \
                    local_rst + "; fi"
                print(cmd, file=f)

                # Then create a symbolic link from the run directory
                # to the restart file in the local ExtData
                index3 = remote_rst.find("initial")
                cmd = "ln -s " + prefix + remote_rst[index3:] + \
                    " " + local_rst
                print(cmd, file=f)
                print(file=f)

            elif "gmi.clim.IPMN.geos5.2x25.nc" in path:

                # ------------------------------------------------------
                # Edge case: The GMI IPMN file is really the PMN file
                # ------------------------------------------------------

                # Download the PMN file
                index = path.find("ExtData") + 7
                local_dir = os.path.dirname(path)
                remote_path = remote_root + path[index:]
                remote_path = remote_path.replace("IPMN", "PMN")
                cmd = cmd_prefix + quote + remote_path + quote
                if from_aws:
                    cmd += " " + local_dir + "/"
                print(cmd, file=f)

                # Rename it to IPMN
                cmd = "mv " + local_dir + "/gmi.clim.PMN.geos5.2x25.nc " + \
                    local_dir + "/gmi.clim.IPMN.geos5.2x25.nc"
                print(cmd, file=f)
                print(file=f)

            elif "gmi.clim.NPMN.geos5.2x25.nc" in path:

                # ------------------------------------------------------
                # Edge case: The GMI NPMN file is really the PMN file
                # ------------------------------------------------------

                # Download the PMN file
                index = path.find("ExtData") + 7
                local_dir = os.path.dirname(path)
                remote_path = remote_root + path[index:]
                remote_path = remote_path.replace("NPMN", "PMN")
                cmd = cmd_prefix + quote + remote_path + quote
                if from_aws:
                    cmd += " " + local_dir + "/"
                print(cmd, file=f)

                # Rename it to NPMN
                cmd = "mv " + local_dir + "/gmi.clim.PMN.geos5.2x25.nc " + \
                    local_dir + "/gmi.clim.NPMN.geos5.2x25.nc"
                print(cmd, file=f)
                print(file=f)

            elif "gmi.clim.RIPA.geos5.2x25.nc" in path:

                # ------------------------------------------------------
                # Edge case: The GMI RIPA file is really the RIP file
                # ------------------------------------------------------

                # Download the RIP file
                index = path.find("ExtData") + 7
                local_dir = os.path.dirname(path)
                remote_path = remote_root + path[index:]
                remote_path = remote_path.replace("RIPA", "RIP")
                cmd = cmd_prefix + quote + remote_path + quote
                if from_aws:
                    cmd += " " + local_dir + "/"
                print(cmd, file=f)

                # Rename it to RIPA
                cmd = "mv " + local_dir + "/gmi.clim.RIP.geos5.2x25.nc " + \
                    local_dir + "/gmi.clim.RIPA.geos5.2x25.nc"
                print(cmd, file=f)
                print(file=f)

            elif "gmi.clim.RIPB.geos5.2x25.nc" in path:

                # ------------------------------------------------------
                # Edge case: The GMI RIPB file is really the RIP file
                # ------------------------------------------------------

                # Download the RIP file
                index = path.find("ExtData") + 7
                local_dir = os.path.dirname(path)
                remote_path = remote_root + path[index:]
                remote_path = remote_path.replace("RIPB", "RIP")
                cmd = cmd_prefix + quote + remote_path + quote
                if from_aws:
                    cmd += " " + local_dir + "/"
                print(cmd, file=f)

                # Rename it to RIPB
                cmd = "mv " + local_dir + "/gmi.clim.RIP.geos5.2x25.nc " + \
                    local_dir + "/gmi.clim.RIPB.geos5.2x25.nc"
                print(cmd, file=f)
                print(file=f)

            elif "gmi.clim.RIPD.geos5.2x25.nc" in path:

                # ------------------------------------------------------
                # Edge case: The GMI RIPD file is really the RIP file
                # ------------------------------------------------------

                # Download the RIP file
                index = path.find("ExtData") + 7
                local_dir = os.path.dirname(path)
                remote_path = remote_root + path[index:]
                remote_path = remote_path.replace("RIPD", "RIP")
                cmd = cmd_prefix + quote + remote_path + quote
                if from_aws:
                    cmd += " " + local_dir + "/"
                print(cmd, file=f)

                # Rename it to RIPD
                cmd = "mv " + local_dir + "/gmi.clim.RIP.geos5.2x25.nc " + \
                    local_dir + "/gmi.clim.RIPD.geos5.2x25.nc"
                print(cmd, file=f)
                print(file=f)

            elif "ExtData" in path:

                # ------------------------------------------------------
                # All other files in ExtData
                # ------------------------------------------------------
                index = path.find("ExtData") + 7
                local_dir = os.path.dirname(path)
                remote_path = remote_root + path[index:]
                cmd = cmd_prefix + quote + remote_path + quote
                if from_aws:
                    cmd += " " + local_dir + "/"
                print(cmd, file=f)
                print(file=f)

        # Kludge: Create an ExtData/CHEM_INPUTS folder if it does not
        # exist.  This will prevent abnormal exits.
        chem_inputs_dir = paths["local_prefix"] + "ExtData/CHEM_INPUTS"
        cmd = "if [[ ! -d {} ]]; then mkdir {}; fi".format(
            chem_inputs_dir, chem_inputs_dir)
        print(cmd, file=f)
        print(file=f)

    # Make the download script executable
    os.chmod(DATA_DOWNLOAD_SCRIPT, 0o755)

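# For reference, the generated bash script contains one download command
# per missing file; e.g. (hypothetical paths), for the general ExtData case:
#
#   # ComputeCanada (wget):
#   wget -r -np -nH -R "*.html" -N -P /path/to/ "http://geoschemdata.computecanada.ca/ExtData/HEMCO/SOME/DIR/file.nc"
#
#   # AWS (aws s3 cp):
#   aws s3 cp --request-payer=requester s3://gcgrid/HEMCO/SOME/DIR/file.nc /path/to/ExtData/HEMCO/SOME/DIR/
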
def download_the_data(args):
    """
    Downloads GEOS-Chem data files from the ComputeCanada server
    or from the AWS s3://gcgrid bucket.

    Args:
    -----
    args : dict
        Output of function parse_args.
    """

    # Get information about the run
    run_info = get_run_info()

    # Get a unique list of data paths (both found and missing),
    # then expand the data paths to include links to restart files
    paths = extract_pathnames_from_log(args["dryrun_log"])
    paths = expand_restart_file_names(paths, run_info)

    # Write a list of unique file paths
    write_unique_paths(paths, args["dryrun_log"] + ".unique")

    # Exit without downloading if the skip-download flag was specified
    if args["skip_download"]:
        return

    # Print a message
    if args["from_aws"]:
        print("Downloading data from AWS")
    else:
        print("Downloading data from ComputeCanada")

    # Create a script to download the missing files
    create_download_script(paths, args["from_aws"])

    # Run the data download script and get its exit status.
    # Remove the script afterwards.
    status = subprocess.call(DATA_DOWNLOAD_SCRIPT)
    os.remove(DATA_DOWNLOAD_SCRIPT)

    # Raise an exception if the data was not successfully downloaded
    if status != 0:
        if args["from_aws"]:
            err_msg = "Error downloading data from AWS!"
        else:
            err_msg = "Error downloading data from ComputeCanada!"
        raise Exception(err_msg)


def parse_args():
    """
    Parses the command-line arguments (from sys.argv) passed
    to the main program.

    Returns:
    --------
    args : dict
        args["dryrun_log"]: Log file with dry-run output.
        args["from_aws"]: Download from AWS S3? (True/False)
        args["skip_download"]: Skip downloading and only write out
            the log with unique file names.
    """
    dryrun_log = ""
    from_aws = False
    skip_download = False

    for i in range(1, len(sys.argv)):
        if "AWS" in sys.argv[i].upper():
            from_aws = True
        elif "CC" in sys.argv[i].upper():
            from_aws = False
        elif "SKIP" in sys.argv[i].upper():
            skip_download = True
        else:
            dryrun_log = sys.argv[i]

    if len(dryrun_log) == 0:
        raise ValueError("Need to specify the log file with dryrun output!")

    return {"dryrun_log": dryrun_log,
            "from_aws": from_aws,
            "skip_download": skip_download}


def main():
    """
    Main program.  Gets the command-line arguments and calls function
    download_the_data to initiate the data-downloading process.

    Calling sequence:
    -----------------
    ./download_data.py log -aws            # Download from AWS
    ./download_data.py log -cc             # Download from ComputeCanada
    ./download_data.py log -skip-download  # Print unique log & exit
    """
    download_the_data(parse_args())


if __name__ == "__main__":
    main()
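
# A minimal sketch (with a hypothetical log file name) of calling the
# download logic directly from another Python script, bypassing the
# command line:
#
#   from download_data import download_the_data
#   download_the_data({"dryrun_log": "gc_4x5_standard.log",
#                      "from_aws": False,
#                      "skip_download": True})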