Welcome to the eocis_chuk_api documentation

Creating, loading and exporting CHUK data

CHUKDataSetUtils

Provide utility functions for working with CHUK datasets, based on xarray data structures.

This class helps users to work with CHUK datasets, providing support for creating and converting data and metadata.

Examples:

>>> import xarray as xr
>>> import numpy as np
>>> from eocis_chuk_api import CHUKDataSetUtils
>>> utils = CHUKDataSetUtils("EOCIS-CHUK-GRID-100M-v1.0.nc")
>>> chuk_ds = utils.create_new_dataset(
     title="My CHUK dataset",
     institution = "EOCIS CHUK",
     Conventions = "CF-1.10",
     tracking_id = "12345678",
     product_version = "1.0",
     summary = "Shows estimates of the squirrel population in each CHUK grid cell",
     license = "Creative Commons Licence by attribution (https://creativecommons.org/licenses/by/4.0/)",
     history = "Developed from the squirrel population dataset",
     comment = "This is a made up example",
     creator_url = "https://www.example.com",
     creator_name = "Institute of Squirrel Studies",
     creator_email = "enquiries@squirrel-studies.org.uk",
     creator_processing_institution = "Institute of Squirrel Studies")
>>> # create an array to hold the data
>>> population_data = np.zeros(utils.get_grid_shape())
>>> # populate the data
>>> population_data[...] = ...
>>> # attach the data
>>> chuk_ds["squirrel_population"] = xr.DataArray(population_data,dims=("y","x"), attrs={
    "long_name":"estimated_squirrel_population",
    "coordinates": "lat lon",
    "grid_mapping": "crsOSGB"
})
>>> # save the dataset
>>> utils.save(chuk_ds, "EOCIS-CHUK-L4-SQUIRRELPOP-MERGED-20231204-v0.1.nc")
Source code in eocis_chuk_api/chuk_dataset_utils.py
class CHUKDataSetUtils:
    """Provide utility functions for working with CHUK datasets, based on xarray data structures.

    This class helps users to work with CHUK datasets, providing support for creating and converting data and metadata.

    Examples:

        >>> import xarray as xr
        >>> import numpy as np
        >>> from eocis_chuk_api import CHUKDataSetUtils
        >>> utils = CHUKDataSetUtils("EOCIS-CHUK-GRID-100M-v1.0.nc")
        >>> chuk_ds = utils.create_new_dataset(
             title="My CHUK dataset",
             institution = "EOCIS CHUK",
             Conventions = "CF-1.10",
             tracking_id = "12345678",
             product_version = "1.0",
             summary = "Shows estimates of the squirrel population in each CHUK grid cell",
             license = "Creative Commons Licence by attribution (https://creativecommons.org/licenses/by/4.0/)",
             history = "Developed from the squirrel population dataset",
             comment = "This is a made up example",
             creator_url = "https://www.example.com",
             creator_name = "Institute of Squirrel Studies",
             creator_email = "enquiries@squirrel-studies.org.uk",
             creator_processing_institution = "Institute of Squirrel Studies")
        >>> # create an array to hold the data
        >>> population_data = np.zeros(utils.get_grid_shape())
        >>> # populate the data
        >>> population_data[...] = ...
        >>> # attach the data
        >>> chuk_ds["squirrel_population"] = xr.DataArray(population_data,dims=("y","x"), attrs={
            "long_name":"estimated_squirrel_population",
            "coordinates": "lat lon",
            "grid_mapping": "crsOSGB"
        })
        >>> # save the dataset
        >>> utils.save(chuk_ds, "EOCIS-CHUK-L4-SQUIRRELPOP-MERGED-20231204-v0.1.nc")

    """

    def __init__(self, chuk_grid_path: str):
        """
        Initialise an instance with the path to the CHUK grid file

        Args:
            chuk_grid_path: path to a grid file

        Notes:
            grid files can be obtained from https://gws-access.jasmin.ac.uk/public/nceo_uor/eocis-chuk/

        Examples:
            >>> from eocis_chuk_api import CHUKDataSetUtils
            >>> utils = CHUKDataSetUtils("EOCIS-CHUK-GRID-100M-v0.4.nc")
        """
        self.chuk_grid_ds = xr.open_dataset(chuk_grid_path)
        self.grid_resolution = int(self.chuk_grid_ds.x.data[1]) - int(self.chuk_grid_ds.x.data[0])

    def get_grid_latlons(self) -> (xarray.DataArray, xarray.DataArray):
        """
        Obtain the chuk grid lats/lons

        Returns:
            2-tuple containing xarray.DataArray objects (lats,lons)
        """
        return (self.chuk_grid_ds.lat, self.chuk_grid_ds.lon)

    def get_grid_shape(self) -> (int, int):
        """
        Obtain the chuk grid shape (y,x)

        Returns:
            2-tuple containing the grid (height, width)
        """
        return self.chuk_grid_ds.lat.shape

    def create_filename(self, project: str, processing_level: str, product_type: str, product_string: str,
                        datetime: str, version: str, additional_segregator: str = None, suffix: str = ".nc") -> str:
        """
        Create an EOCIS standards compliant filename

        Args:
            project: the EOCIS project string (see the appropriate standards doc)
            processing_level: specify the processing level in (L0, L1A, L1B, L1C, L2, L2P, L3, L3U, L3C, L3S, L4, IND)
            product_type: standardised term to describe the main product type in the dataset, see standards doc
            product_string: descriptive name chosen by the team, should not contain hyphens, can contain underscores
            datetime: date and optionally time, format YYYY[MM[DD[HH[MM[SS]]]]]
            version: File version number one or more digits followed by an optional "." and another one or more digits
            additional_segregator: an optional extra segregator, to be used if otherwise different data sets
                                   would generate the same filename
            suffix: the file suffix, including the "."

        Returns:
            Formatted filename
        """
        segregator = "" if additional_segregator is None else "-" + additional_segregator
        return f"EOCIS-{project}-{processing_level}-{product_type}-{product_string}{segregator}-{datetime}-fv{version}{suffix}"

    def create_new_dataset(self,
                           title: str = "",
                           include_lon_lat: bool = False,
                           institution: str = "EOCIS CHUK",
                           source: str = "",
                           history: str = "",
                           references: str = "",
                           tracking_id: str = "",
                           Conventions: str = "CF-1.10",
                           product_version: str = "",
                           format_version: str = "",
                           summary: str = "",
                           keywords: str = "",
                           id: str = "",
                           naming_authority: str = "",
                           keywords_vocabulary: str = "",
                           cdm_data_type: str = "",
                           comment: str = "",
                           date_created: str = "",
                           creator_name: str = "",
                           creator_url: str = "",
                           creator_email: str = "",
                           project: str = "Earth Observation Climate Information Service (EOCIS)",
                           geospatial_lat_min: str = "47.089",
                           geospatial_lat_max: str = "61.133",
                           geospatial_lon_min: str = "-15.374",
                           geospatial_lon_max: str = "4.750",
                           geospatial_vertical_min: str = "0",
                           geospatial_vertical_max: str = "0",
                           time_coverage_start: str = "",
                           time_coverage_end: str = "",
                           time_coverage_duration: str = "",
                           time_coverage_resolution: str = "",
                           standard_name_vocabulary: str = "",
                           license: str = "Creative Commons Attribution 4.0 International (CC-BY 4.0 license)",
                           platform: str = "",
                           sensor: str = "",
                           spatial_resolution: str = "100m",
                           geospatial_lat_units: str = "degrees_north",
                           geospatial_lon_units: str = "degrees_east",
                           geospatial_lon_resolution: str = "0.0009",
                           geospatial_lat_resolution: str = "0.00086",
                           key_variables: str = "",
                           acknowledgement: str = "Funded by UK EOCIS. Use of these data should acknowledge EOCIS",
                           publisher_url: str = "https://eocis.org",
                           publisher_name: str = "EOCIS",
                           publisher_email: str = "EOCIS@reading.ac.uk",
                           **other_attributes: dict) -> xarray.Dataset:
        """
        Create a new CHUK dataset with expected global attributes.

        Args:
            include_lon_lat: True if lon and lat 2d variables should be included
            title: a title for the dataset
            institution: The institution that created the dataset
            source: Comma separated list of original data sources (+DOIs if available)
            history: Processing history of the dataset
            references: References to algorithm, ATBD, technical note describing dataset
            tracking_id: A UUID (Universal Unique Identifier) value
            Conventions: The CF Version e.g. CF-1.10
            product_version: The product version of this data file
            format_version: The EOCIS data format used e.g. “EOCIS Data Standards v1.x”
            summary: A paragraph describing the dataset
            keywords: A comma separated list of key words and phrases
            id: see naming_authority
            naming_authority: The combination of the naming authority and the id should be a globally unique identifier for the dataset
            keywords_vocabulary: If you are following a guideline for the words/phrases in your “keywords” attribute, put the name of that guideline here
            cdm_data_type: The THREDDS data type appropriate for this dataset
            comment: Miscellaneous information about the data
            date_created: The date on which the data was created
            creator_name: The person/organisation that created the data
            creator_url: A URL for the person/organisation that created the data
            creator_email: Contact email address for the person/organisation that created the data
            project: The scientific project that produced the data: “Earth Observation Climate Information Service (EOCIS)”
            geospatial_lat_min: Decimal degrees north, range -90 to +90
            geospatial_lat_max: Decimal degrees north, range -90 to +90
            geospatial_lon_min: Decimal degrees east, range -180 to +180
            geospatial_lon_max: Decimal degrees east, range -180 to +180
            geospatial_vertical_min: Assumed to be in metres above ground unless geospatial_vertical_units attribute defined otherwise
            geospatial_vertical_max: Assumed to be in metres above ground unless geospatial_vertical_units attribute defined otherwise
            time_coverage_start: Format yyyymmddThhmmssZ
            time_coverage_end: Format yyyymmddThhmmssZ
            time_coverage_duration: Should be an ISO8601 duration string, for example P1D
            time_coverage_resolution: Should be an ISO8601 duration string. For L2 data on the original satellite sampling it is acceptable to use 'satellite_orbit_frequency'
            standard_name_vocabulary: The name of the controlled vocabulary from which variable standard names are taken e.g. ‘CF Standard Name Table v82’
            license: Describe the restrictions to data access and distribution
            platform: Satellite name e.g. Sentinel-5. Separate lists by commas and use angled brackets for a platform series, e.g. ‘Envisat, NOAA-<12,14,16,17,18>, Metop-A’. The platform names used should follow the naming in the CCI controlled vocabulary
            sensor: Sensor name e.g. AATSR. Separate lists by commas. The sensor names used should follow the naming in the CCI controlled vocabulary
            spatial_resolution: A free-text string describing the approximate resolution of the product. For example, “1.1km at nadir”. This is intended to provide a useful indication to the user, so if more than one resolution is relevant e.g. the grid resolution and the data resolution, then both can be included.
            geospatial_lat_units: Geospatial latitude units used
            geospatial_lon_units: Geospatial longitude units used
            geospatial_lon_resolution: Geospatial longitude resolution used
            geospatial_lat_resolution: Geospatial latitude resolution used
            key_variables: A comma separated list of the key primary variables in the file i.e. those that have been scientifically validated.
            acknowledgement: Acknowledge funding sources and/or contributors
            publisher_name: The organisation publishing the data
            publisher_url: A URL for the organisation publishing the data
            publisher_email: Contact email address for the organisation publishing the data
            other_attributes: any other attributes to include

        Returns:
            An xarray.Dataset object
        """

        attrs = {}
        self.__extend_attrs(attrs, "title", title, required=True)
        self.__extend_attrs(attrs, "institution", institution, required=True)
        self.__extend_attrs(attrs, "source", source)
        self.__extend_attrs(attrs, "history", history)
        self.__extend_attrs(attrs, "references", references)
        self.__extend_attrs(attrs, "tracking_id", tracking_id, required=True),
        self.__extend_attrs(attrs, "Conventions", Conventions)
        self.__extend_attrs(attrs, "product_version", product_version, required=True)
        self.__extend_attrs(attrs, "format_version", format_version)
        self.__extend_attrs(attrs, "summary", summary)
        self.__extend_attrs(attrs, "keywords", keywords)
        self.__extend_attrs(attrs, "id", id),
        self.__extend_attrs(attrs, "naming_authority", naming_authority),
        self.__extend_attrs(attrs, "keywords_vocabulary", keywords_vocabulary, required=False),
        self.__extend_attrs(attrs, "cdm_data_type", cdm_data_type),
        self.__extend_attrs(attrs, "comment", comment),
        self.__extend_attrs(attrs, "date_created", date_created),
        self.__extend_attrs(attrs, "creator_name", creator_name),
        self.__extend_attrs(attrs, "creator_url", creator_url),
        self.__extend_attrs(attrs, "creator_email", creator_email),
        self.__extend_attrs(attrs, "project", project),
        self.__extend_attrs(attrs, "geospatial_lat_min", geospatial_lat_min),
        self.__extend_attrs(attrs, "geospatial_lat_max", geospatial_lat_max),
        self.__extend_attrs(attrs, "geospatial_lon_min", geospatial_lon_min),
        self.__extend_attrs(attrs, "geospatial_lon_max", geospatial_lon_max),
        self.__extend_attrs(attrs, "geospatial_vertical_min", geospatial_vertical_min),
        self.__extend_attrs(attrs, "geospatial_vertical_max", geospatial_vertical_max),
        self.__extend_attrs(attrs, "time_coverage_start", time_coverage_start),
        self.__extend_attrs(attrs, "time_coverage_end", time_coverage_end),
        self.__extend_attrs(attrs, "time_coverage_duration", time_coverage_duration),
        self.__extend_attrs(attrs, "time_coverage_resolution", time_coverage_resolution),
        self.__extend_attrs(attrs, "standard_name_vocabulary", standard_name_vocabulary),
        self.__extend_attrs(attrs, "license", license),
        self.__extend_attrs(attrs, "platform", platform),
        self.__extend_attrs(attrs, "sensor", sensor),
        self.__extend_attrs(attrs, "spatial_resolution", spatial_resolution),
        self.__extend_attrs(attrs, "geospatial_lat_units", geospatial_lat_units),
        self.__extend_attrs(attrs, "geospatial_lon_units", geospatial_lon_units),
        self.__extend_attrs(attrs, "geospatial_lon_resolution", geospatial_lon_resolution),
        self.__extend_attrs(attrs, "geospatial_lat_resolution", geospatial_lat_resolution),
        self.__extend_attrs(attrs, "key_variables", key_variables)
        self.__extend_attrs(attrs, "acknowledgement", acknowledgement)
        self.__extend_attrs(attrs, "publisher_name", publisher_name),
        self.__extend_attrs(attrs, "publisher_url", publisher_url),
        self.__extend_attrs(attrs, "publisher_email", publisher_email),

        attrs.update(other_attributes)
        ds = xr.Dataset(attrs=attrs)
        # copy the grid definition from the grid file
        copyvars = ["x", "y", "x_bnds", "y_bnds", "crsOSGB"]
        if include_lon_lat:
            copyvars += ["lon", "lat"]
        for copyvar in copyvars:
            ds[copyvar] = self.chuk_grid_ds[copyvar]

        ds = ds.rio.write_crs("EPSG:27700", grid_mapping_name="crsOSGB")

        return ds

    def add_variable(self, to_dataset: xr.Dataset, data: np.ndarray, variable_name: str, standard_name: str = None,
                     long_name: str = None, units: str = None, source: str = None, **other_attrs:dict):
        """
        Add a new variable to a dataset.  The dataset is updated in-place.

        Args:
            to_dataset: The xarray.Dataset to which the variable will be added
            data: a numpy array containing the data, organised by (y,x), (time,y,x) or (y,x,time)
            variable_name: the name of the variable to be added to the dataset
            standard_name: CF standard name (if appropriate)
            long_name:  A longer descriptive name of the variable
            units: units from UDUNITS
            source: a description of the source of the data (if appropriate)
            other_attrs: dictionary containing other attributes to add to the new variable

        Raises:
            ValueError: if the data parameter does not match the expected shape
        """
        expected_shape = self.get_grid_shape()
        if len(data.shape) == 2:
            dims = ("y", "x")
            if data.shape != expected_shape:
                raise ValueError("Bad data shape, expecting: " + str(expected_shape) + " was: " + str(data.shape))
        else:
            if data.shape[1:] == expected_shape:
                dims = ("time", "y", "x")
            elif data.shape[:2] == expected_shape:
                dims = ("y", "x", "time")
            else:
                raise ValueError("Bad data shape, expecting: " + str(expected_shape) + " was: " + str(data.shape))

        attrs = {
            "grid_mapping": "crsOSGB"
        }
        if standard_name is not None:
            attrs["standard_name"] = standard_name
        if long_name is not None:
            attrs["long_name"] = long_name
        if source is not None:
            attrs["source"] = source
        if units is not None:
            attrs["units"] = units
        attrs.update(other_attrs)

        to_dataset[variable_name] = xr.DataArray(data=data, dims=dims, attrs=attrs)

    def load(self, from_path: str, add_latlon: bool = False, add_latlon_bnds: bool = False) -> xarray.Dataset:
        """
        Load a CHUK dataset from file and return a dataset

        Args:
            from_path: path to a NetCDF4 file
            add_latlon: add lon and lat 2D arrays to the dataset
            add_latlon_bnds: add lon_bnds and lat_bnds 2D arrays to the dataset

        Returns:
            A dataset containing the loaded CHUK data
        """
        ds = xr.open_dataset(from_path, decode_coords="all")

        self.extend_latlon(ds, add_latlon=add_latlon, add_latlon_bnds=add_latlon_bnds)

        return ds

    def save(self, ds: xarray.Dataset, to_path: str, add_latlon: bool = False, add_latlon_bnds: bool = False,
             x_chunk_size: int = 1000, y_chunk_size: int = 1000,
             time_chunk_size: int = 1, custom_encodings: dict = {}, override_encodings: dict={}):
        """
        Save a CHUK dataset to file, applying the standard chunking and compression

        Args:
            ds: an xarray dataset containing CHUK data
            to_path: path to a NetCDF4 file
            add_latlon: add lon and lat 2D arrays to the dataset
            add_latlon_bnds: add lon_bnds and lat_bnds 2D arrays to the dataset
            x_chunk_size: size of chunking in the x-dimension
            y_chunk_size: size of chunking in the y-dimension
            time_chunk_size: size of chunking in the time dimension
            custom_encodings: dictionary mapping from variable names to a custom encoding to use by xarray
            override_encodings: dictionary mapping from variable names to encoding settings that override the defaults (a setting with value None is removed)
        """

        encodings = {}

        for v in ds.variables:
            if custom_encodings and v in custom_encodings:
                encodings[v] = custom_encodings[v]
            else:
                dims = ds[v].dims
                if "x" in dims and "y" in dims:

                    encodings[v] = {
                        "zlib": True,
                        "complevel": 5
                    }

                    if v in override_encodings:
                        for (name,value) in override_encodings[v].items():
                            if value is None:
                                if name in encodings[v]:
                                    del encodings[v][name]
                            else:
                                encodings[v][name] = value

                    chunk_sizes = []
                    for d in dims:
                        if d == "y":
                            chunk_sizes.append(y_chunk_size)
                        elif d == "x":
                            chunk_sizes.append(x_chunk_size)
                        elif d == "time":
                            chunk_sizes.append(time_chunk_size)
                        else:
                            chunk_sizes.append(0)
                    encodings[v]["chunksizes"] = chunk_sizes

        self.extend_latlon(ds, add_latlon=add_latlon, add_latlon_bnds=add_latlon_bnds)


        # ds = ds.rio.write_crs("EPSG:27700",grid_mapping_name="crsOSGB")

        ds.to_netcdf(to_path, encoding=encodings)

    def extend_latlon(self, ds, add_latlon=False, add_latlon_bnds=False):
        if add_latlon:
            self.add_latlon(ds)

        if add_latlon_bnds:
            self.add_latlon_bnds(ds)

        if add_latlon:
            for v in ["lat", "lon"]:
                # remove bounds if no such variable exists
                bounds = ds[v].attrs.get("bounds",None)
                if bounds and bounds not in ds.variables:
                    del ds[v].attrs["bounds"]

    def check(self, ds: xarray.Dataset) -> ([(str, str)], [(str, str)]):
        """
        Check a dataset against CHUK format, returning details of any problems found

        Args:
            ds: the xarray.Dataset to check

        Returns:
            2-tuple (warnings, errors) containing lists of (code,detail) tuples
        """

        # perform metadata checks
        warnings, errors = CHUKMetadata.check(ds)

        # check the dimensions are correct, compared to the grid
        for v in ["x", "y"]:
            actual_shape = ds[v].shape
            expected_shape = self.chuk_grid_ds[v].shape
            if actual_shape != expected_shape:
                errors.append(("bad_shape", (v, actual_shape, expected_shape)))

        return warnings, errors

    @staticmethod
    def sample(ds: xarray.Dataset, to_resolution: int) -> xarray.Dataset:
        """
        Create a lower resolution sample of a CHUK dataset

        Args:
            ds: the xarray.Dataset containing CHUK data to sample
            to_resolution: the resolution for the sampled output, must be a multiple of 100

        Returns:
            A dataset containing the sampled data
        """
        if to_resolution % 100 != 0:
            raise ValueError(f"Error - resolution requested ({to_resolution}) is not a multiple of 100")
        sample_step = int(to_resolution / 100)
        return ds.isel(x=slice(0, -1, sample_step), y=slice(0, -1, sample_step))

    def add_latlon(self, ds: xarray.Dataset):
        """
        Add lat and lon 2D arrays from the reference grid

        Args:
            ds: the dataset to modify in-place
        """
        ds["lon"] = self.chuk_grid_ds["lon"]
        ds["lat"] = self.chuk_grid_ds["lat"]

    def add_latlon_bnds(self, ds: xarray.Dataset):
        """
        Add lat and lon 2D bounds from the reference grid

        Args:
            ds: the dataset to modify in-place
        """
        ds["lon_bnds"] = self.chuk_grid_ds["lon_bnds"]
        ds["lat_bnds"] = self.chuk_grid_ds["lat_bnds"]

    @staticmethod
    def save_as_geotif(ds: xarray.Dataset, variable_name: str, to_path: str):
        """
        Save a CHUK dataset to a geotiff.  DEPRECATED - use save_as_geotiff

        Args:
            ds: the CHUK dataset
            variable_name: the name of the variable to save from the dataset
            to_path: the path to save the geotiff file to
        """
        return CHUKDataSetUtils.save_as_geotiff(ds, variable_name, to_path)

    @staticmethod
    def save_as_geotiff(ds: xarray.Dataset, variable_name: str, to_path: str):
        """
        Save a CHUK dataset to a geotiff

        Args:
            ds: the CHUK dataset
            variable_name: the name of the variable to save from the dataset
            to_path: the path to save the geotiff file to
        """
        ds_crs = ds.rio.write_crs("EPSG:27700")
        if "grid_mapping" in ds_crs[variable_name].attrs:
            # this seems to cause a problem, why?
            del ds_crs[variable_name].attrs["grid_mapping"]
        tags = CHUKMetadata.to_json(ds_crs, variable_name)
        ds_crs[variable_name].rio.to_raster(to_path, tags=tags, driver="COG")

    def __extend_attrs(self, attrs, key, value, required=None):
        if value == "" or value is None:
            if required:
                raise ValueError(f"attribute {key} is required")
            return
        attrs[key] = value

__init__(chuk_grid_path)

Initialise an instance with the path to the CHUK grid file

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `chuk_grid_path` | `str` | path to a grid file | *required* |
Notes

grid files can be obtained from https://gws-access.jasmin.ac.uk/public/nceo_uor/eocis-chuk/

Examples:

>>> from eocis_chuk_api import CHUKDataSetUtils
>>> utils = CHUKDataSetUtils("EOCIS-CHUK-GRID-100M-v0.4.nc")
Source code in eocis_chuk_api/chuk_dataset_utils.py
def __init__(self, chuk_grid_path: str):
    """
    Initialise an instance with the path to the CHUK grid file

    Args:
        chuk_grid_path: path to a grid file

    Notes:
        grid files can be obtained from https://gws-access.jasmin.ac.uk/public/nceo_uor/eocis-chuk/

    Examples:
        >>> from eocis_chuk_api import CHUKDataSetUtils
        >>> utils = CHUKDataSetUtils("EOCIS-CHUK-GRID-100M-v0.4.nc")
    """
    self.chuk_grid_ds = xr.open_dataset(chuk_grid_path)
    self.grid_resolution = int(self.chuk_grid_ds.x.data[1]) - int(self.chuk_grid_ds.x.data[0])

add_latlon(ds)

Add lat and lon 2D arrays from the reference grid

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `ds` | `Dataset` | the dataset to modify in-place | *required* |
Source code in eocis_chuk_api/chuk_dataset_utils.py
def add_latlon(self, ds: xarray.Dataset):
    """
    Add lat and lon 2D arrays from the reference grid

    Args:
        ds: the dataset to modify in-place
    """
    ds["lon"] = self.chuk_grid_ds["lon"]
    ds["lat"] = self.chuk_grid_ds["lat"]

add_latlon_bnds(ds)

Add lat and lon 2D bounds from the reference grid

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `ds` | `Dataset` | the dataset to modify in-place | *required* |
Source code in eocis_chuk_api/chuk_dataset_utils.py
def add_latlon_bnds(self, ds: xarray.Dataset):
    """
    Add lat and lon 2D bounds from the reference grid

    Args:
        ds: the dataset to modify in-place
    """
    ds["lon_bnds"] = self.chuk_grid_ds["lon_bnds"]
    ds["lat_bnds"] = self.chuk_grid_ds["lat_bnds"]

add_variable(to_dataset, data, variable_name, standard_name=None, long_name=None, units=None, source=None, **other_attrs)

Add a new variable to a dataset. The dataset is updated in-place.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `to_dataset` | `Dataset` | The xarray.Dataset to which the variable will be added | *required* |
| `data` | `ndarray` | a numpy array containing the data, organised by (y,x), (time,y,x) or (y,x,time) | *required* |
| `variable_name` | `str` | the name of the variable to be added to the dataset | *required* |
| `standard_name` | `str` | CF standard name (if appropriate) | `None` |
| `long_name` | `str` | A longer descriptive name of the variable | `None` |
| `units` | `str` | units from UDUNITS | `None` |
| `source` | `str` | a description of the source of the data (if appropriate) | `None` |
| `other_attrs` | `dict` | dictionary containing other attributes to add to the new variable | `{}` |

Raises:

| Type | Description |
|------|-------------|
| `ValueError` | if the data parameter does not match the expected shape |

Source code in eocis_chuk_api/chuk_dataset_utils.py
def add_variable(self, to_dataset: xr.Dataset, data: np.ndarray, variable_name: str, standard_name: str = None,
                 long_name: str = None, units: str = None, source: str = None, **other_attrs:dict):
    """
    Add a new variable to a dataset.  The dataset is updated in-place.

    Args:
        to_dataset: The xarray.Dataset to which the variable will be added
        data: a numpy array containing the data, organised by (y,x), (time,y,x) or (y,x,time)
        variable_name: the name of the variable to be added to the dataset
        standard_name: CF standard name (if appropriate)
        long_name:  A longer descriptive name of the variable
        units: units from UDUNITS
        source: a description of the source of the data (if appropriate)
        other_attrs: dictionary containing other attributes to add to the new variable

    Raises:
        ValueError: if the data parameter does not match the expected shape
    """
    expected_shape = self.get_grid_shape()
    if len(data.shape) == 2:
        dims = ("y", "x")
        if data.shape != expected_shape:
            raise ValueError("Bad data shape, expecting: " + str(expected_shape) + " was: " + str(data.shape))
    else:
        if data.shape[1:] == expected_shape:
            dims = ("time", "y", "x")
        elif data.shape[:2] == expected_shape:
            dims = ("y", "x", "time")
        else:
            raise ValueError("Bad data shape, expecting: " + str(expected_shape) + " was: " + str(data.shape))

    attrs = {
        "grid_mapping": "crsOSGB"
    }
    if standard_name is not None:
        attrs["standard_name"] = standard_name
    if long_name is not None:
        attrs["long_name"] = long_name
    if source is not None:
        attrs["source"] = source
    if units is not None:
        attrs["units"] = units
    attrs.update(other_attrs)

    to_dataset[variable_name] = xr.DataArray(data=data, dims=dims, attrs=attrs)
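
A minimal usage sketch, assuming the utils and chuk_ds objects from the class-level example; the variable name and attributes below are made up for illustration:

>>> import numpy as np
>>> data = np.zeros(utils.get_grid_shape())  # one value per CHUK grid cell
>>> utils.add_variable(chuk_ds, data, "tree_cover_fraction",
...     long_name="fraction of each cell covered by trees", units="1")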

check(ds)

Check a dataset against CHUK format, returning details of any problems found

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `ds` | `Dataset` | the xarray.Dataset to check | *required* |

Returns:

| Type | Description |
|------|-------------|
| `([(str, str)], [(str, str)])` | 2-tuple (warnings, errors) containing lists of (code, detail) tuples |

Source code in eocis_chuk_api/chuk_dataset_utils.py
def check(self, ds: xarray.Dataset) -> ([(str, str)], [(str, str)]):
    """
    Check a dataset against CHUK format, returning details of any problems found

    Args:
        ds: the xarray.Dataset to check

    Returns:
        2-tuple (warnings, errors) containing lists of (code,detail) tuples
    """

    # perform metadata checks
    warnings, errors = CHUKMetadata.check(ds)

    # check the dimensions are correct, compared to the grid
    for v in ["x", "y"]:
        actual_shape = ds[v].shape
        expected_shape = self.chuk_grid_ds[v].shape
        if actual_shape != expected_shape:
            errors.append(("bad_shape", (v, actual_shape, expected_shape)))

    return warnings, errors
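
A sketch of checking a dataset and printing any problems found, assuming chuk_ds was built as in the class-level example:

>>> warnings, errors = utils.check(chuk_ds)
>>> for (code, detail) in warnings:
...     print("WARNING", code, detail)
>>> for (code, detail) in errors:
...     print("ERROR", code, detail)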

create_filename(project, processing_level, product_type, product_string, datetime, version, additional_segregator=None, suffix='.nc')

Create an EOCIS standards compliant filename

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `project` | `str` | the EOCIS project string (see the appropriate standards doc) | *required* |
| `processing_level` | `str` | the processing level, one of (L0, L1A, L1B, L1C, L2, L2P, L3, L3U, L3C, L3S, L4, IND) | *required* |
| `product_type` | `str` | standardised term to describe the main product type in the dataset, see standards doc | *required* |
| `product_string` | `str` | descriptive name chosen by the team; should not contain hyphens, can contain underscores | *required* |
| `datetime` | `str` | date and optionally time, format YYYY[MM[DD[HH[MM[SS]]]]] | *required* |
| `version` | `str` | file version number: one or more digits, optionally followed by "." and one or more further digits | *required* |
| `additional_segregator` | `str` | an optional extra segregator, to be used if otherwise different data sets would generate the same filename | `None` |
| `suffix` | `str` | the file suffix, including the "." | `'.nc'` |

Returns:

| Type | Description |
|------|-------------|
| `str` | Formatted filename |

Source code in eocis_chuk_api/chuk_dataset_utils.py
def create_filename(self, project: str, processing_level: str, product_type: str, product_string: str,
                    datetime: str, version: str, additional_segregator: str = None, suffix: str = ".nc") -> str:
    """
    Create an EOCIS standards compliant filename

    Args:
        project: the EOCIS project string (see the appropriate standards doc)
        processing_level: specify the processing level in (L0, L1A, L1B, L1C, L2, L2P, L3, L3U, L3C, L3S, L4, IND)
        product_type: standardised term to describe the main product type in the dataset, see standards doc
        product_string: descriptive name chosen by the team, should not contain hyphens, can contain underscores
        datetime: date and optionally time, format YYYY[MM[DD[HH[MM[SS]]]]]
        version: File version number one or more digits followed by an optional "." and another one or more digits
        additional_segregator: an optional extra segregator, to be used if otherwise different data sets
                               would generate the same filename
        suffix: the file suffix, including the "."

    Returns:
        Formatted filename
    """
    segregator = "" if additional_segregator is None else "-" + additional_segregator
    return f"EOCIS-{project}-{processing_level}-{product_type}-{product_string}{segregator}-{datetime}-fv{version}{suffix}"

create_new_dataset(title='', include_lon_lat=False, institution='EOCIS CHUK', source='', history='', references='', tracking_id='', Conventions='CF-1.10', product_version='', format_version='', summary='', keywords='', id='', naming_authority='', keywords_vocabulary='', cdm_data_type='', comment='', date_created='', creator_name='', creator_url='', creator_email='', project='Earth Observation Climate Information Service (EOCIS)', geospatial_lat_min='47.089', geospatial_lat_max='61.133', geospatial_lon_min='-15.374', geospatial_lon_max='4.750', geospatial_vertical_min='0', geospatial_vertical_max='0', time_coverage_start='', time_coverage_end='', time_coverage_duration='', time_coverage_resolution='', standard_name_vocabulary='', license='Creative Commons Attribution 4.0 International (CC-BY 4.0 license)', platform='', sensor='', spatial_resolution='100m', geospatial_lat_units='degrees_north', geospatial_lon_units='degrees_east', geospatial_lon_resolution='0.0009', geospatial_lat_resolution='0.00086', key_variables='', acknowledgement='Funded by UK EOCIS. Use of these data should acknowledge EOCIS', publisher_url='https://eocis.org', publisher_name='EOCIS', publisher_email='EOCIS@reading.ac.uk', **other_attributes)

Create a new CHUK dataset with expected global attributes.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `title` | `str` | a title for the dataset | `''` |
| `include_lon_lat` | `bool` | True if lon and lat 2d variables should be included | `False` |
| `institution` | `str` | The institution that created the dataset | `'EOCIS CHUK'` |
| `source` | `str` | Comma separated list of original data sources (+DOIs if available) | `''` |
| `history` | `str` | Processing history of the dataset | `''` |
| `references` | `str` | References to algorithm, ATBD, technical note describing dataset | `''` |
| `tracking_id` | `str` | A UUID (Universally Unique Identifier) value | `''` |
| `Conventions` | `str` | The CF version, e.g. CF-1.10 | `'CF-1.10'` |
| `product_version` | `str` | The product version of this data file | `''` |
| `format_version` | `str` | The EOCIS data format used, e.g. "EOCIS Data Standards v1.x" | `''` |
| `summary` | `str` | A paragraph describing the dataset | `''` |
| `keywords` | `str` | A comma separated list of key words and phrases | `''` |
| `id` | `str` | see naming_authority | `''` |
| `naming_authority` | `str` | The combination of the naming authority and the id should be a globally unique identifier for the dataset | `''` |
| `keywords_vocabulary` | `str` | If you are following a guideline for the words/phrases in your "keywords" attribute, put the name of that guideline here | `''` |
| `cdm_data_type` | `str` | The THREDDS data type appropriate for this dataset | `''` |
| `comment` | `str` | Miscellaneous information about the data | `''` |
| `date_created` | `str` | The date on which the data was created | `''` |
| `creator_name` | `str` | The person/organisation that created the data | `''` |
| `creator_url` | `str` | A URL for the person/organisation that created the data | `''` |
| `creator_email` | `str` | Contact email address for the person/organisation that created the data | `''` |
| `project` | `str` | The scientific project that produced the data | `'Earth Observation Climate Information Service (EOCIS)'` |
| `geospatial_lat_min` | `str` | Decimal degrees north, range -90 to +90 | `'47.089'` |
| `geospatial_lat_max` | `str` | Decimal degrees north, range -90 to +90 | `'61.133'` |
| `geospatial_lon_min` | `str` | Decimal degrees east, range -180 to +180 | `'-15.374'` |
| `geospatial_lon_max` | `str` | Decimal degrees east, range -180 to +180 | `'4.750'` |
| `geospatial_vertical_min` | `str` | Assumed to be in metres above ground unless the geospatial_vertical_units attribute is defined otherwise | `'0'` |
| `geospatial_vertical_max` | `str` | Assumed to be in metres above ground unless the geospatial_vertical_units attribute is defined otherwise | `'0'` |
| `time_coverage_start` | `str` | Format yyyymmddThhmmssZ | `''` |
| `time_coverage_end` | `str` | Format yyyymmddThhmmssZ | `''` |
| `time_coverage_duration` | `str` | Should be an ISO8601 duration string, for example P1D | `''` |
| `time_coverage_resolution` | `str` | Should be an ISO8601 duration string. For L2 data on the original satellite sampling it is acceptable to use 'satellite_orbit_frequency' | `''` |
| `standard_name_vocabulary` | `str` | The name of the controlled vocabulary from which variable standard names are taken, e.g. 'CF Standard Name Table v82' | `''` |
| `license` | `str` | Describe the restrictions to data access and distribution | `'Creative Commons Attribution 4.0 International (CC-BY 4.0 license)'` |
| `platform` | `str` | Satellite name, e.g. Sentinel-5. Separate lists by commas and use angled brackets for a platform series, e.g. 'Envisat, NOAA-<12,14,16,17,18>, Metop-A'. The platform names used should follow the naming in the CCI controlled vocabulary | `''` |
| `sensor` | `str` | Sensor name, e.g. AATSR. Separate lists by commas. The sensor names used should follow the naming in the CCI controlled vocabulary | `''` |
| `spatial_resolution` | `str` | A free-text string describing the approximate resolution of the product, for example "1.1km at nadir". This is intended to provide a useful indication to the user, so if more than one resolution is relevant, e.g. the grid resolution and the data resolution, then both can be included | `'100m'` |
| `geospatial_lat_units` | `str` | Geospatial latitude units used | `'degrees_north'` |
| `geospatial_lon_units` | `str` | Geospatial longitude units used | `'degrees_east'` |
| `geospatial_lon_resolution` | `str` | Geospatial longitude resolution used | `'0.0009'` |
| `geospatial_lat_resolution` | `str` | Geospatial latitude resolution used | `'0.00086'` |
| `key_variables` | `str` | A comma separated list of the key primary variables in the file, i.e. those that have been scientifically validated | `''` |
| `acknowledgement` | `str` | Acknowledge funding sources and/or contributors | `'Funded by UK EOCIS. Use of these data should acknowledge EOCIS'` |
| `publisher_name` | `str` | The organisation publishing the data | `'EOCIS'` |
| `publisher_url` | `str` | A URL for the organisation publishing the data | `'https://eocis.org'` |
| `publisher_email` | `str` | Contact email address for the organisation publishing the data | `'EOCIS@reading.ac.uk'` |
| `other_attributes` | `dict` | any other attributes to include | `{}` |

Returns:

| Type | Description |
|------|-------------|
| `Dataset` | An xarray.Dataset object |

Source code in eocis_chuk_api/chuk_dataset_utils.py
def create_new_dataset(self,
                       title: str = "",
                       include_lon_lat: bool = False,
                       institution: str = "EOCIS CHUK",
                       source: str = "",
                       history: str = "",
                       references: str = "",
                       tracking_id: str = "",
                       Conventions: str = "CF-1.10",
                       product_version: str = "",
                       format_version: str = "",
                       summary: str = "",
                       keywords: str = "",
                       id: str = "",
                       naming_authority: str = "",
                       keywords_vocabulary: str = "",
                       cdm_data_type: str = "",
                       comment: str = "",
                       date_created: str = "",
                       creator_name: str = "",
                       creator_url: str = "",
                       creator_email: str = "",
                       project: str = "Earth Observation Climate Information Service (EOCIS)",
                       geospatial_lat_min: str = "47.089",
                       geospatial_lat_max: str = "61.133",
                       geospatial_lon_min: str = "-15.374",
                       geospatial_lon_max: str = "4.750",
                       geospatial_vertical_min: str = "0",
                       geospatial_vertical_max: str = "0",
                       time_coverage_start: str = "",
                       time_coverage_end: str = "",
                       time_coverage_duration: str = "",
                       time_coverage_resolution: str = "",
                       standard_name_vocabulary: str = "",
                       license: str = "Creative Commons Attribution 4.0 International (CC-BY 4.0 license)",
                       platform: str = "",
                       sensor: str = "",
                       spatial_resolution: str = "100m",
                       geospatial_lat_units: str = "degrees_north",
                       geospatial_lon_units: str = "degrees_east",
                       geospatial_lon_resolution: str = "0.0009",
                       geospatial_lat_resolution: str = "0.00086",
                       key_variables: str = "",
                       acknowledgement: str = "Funded by UK EOCIS. Use of these data should acknowledge EOCIS",
                       publisher_url: str = "https://eocis.org",
                       publisher_name: str = "EOCIS",
                       publisher_email: str = "EOCIS@reading.ac.uk",
                       **other_attributes: dict) -> xarray.Dataset:
    """
    Create a new CHUK dataset with expected global attributes.

    Args:
        include_lon_lat: True if lon and lat 2d variables should be included
        title: a title for the dataset
        institution: The institution that created the dataset
        source: Comma separated list of original data sources (+DOIs if available)
        history: Processing history of the dataset
        references: References to algorithm, ATBD, technical note describing dataset
        tracking_id: A UUID (Universal Unique Identifier) value
        Conventions: The CF Version e.g. CF-1.10
        product_version: The product version of this data file
        format_version: The EOCIS data format used e.g. “EOCIS Data Standards v1.x”
        summary: A paragraph describing the dataset
        keywords: A comma separated list of key words and phrases
        id: see naming_authority
        naming_authority: The combination of the naming authority and the id should be a globally unique identifier for the dataset
        keywords_vocabulary: If you are following a guideline for the words/phrases in your “keywords” attribute, put the name of that guideline here
        cdm_data_type: The THREDDS data type appropriate for this dataset
        comment: Miscellaneous information about the data
        date_created: The date on which the data was created
        creator_name: The person/organisation that created the data
        creator_url: A URL for the person/organisation that created the data
        creator_email: Contact email address for the person/organisation that created the data
        project: The scientific project that produced the data: “Earth Observation Climate Information Service (EOCIS)”
        geospatial_lat_min: Decimal degrees north, range -90 to +90
        geospatial_lat_max: Decimal degrees north, range -90 to +90
        geospatial_lon_min: Decimal degrees east, range -180 to +180
        geospatial_lon_max: Decimal degrees east, range -180 to +180
        geospatial_vertical_min: Assumed to be in metres above ground unless geospatial_vertical_units attribute defined otherwise
        geospatial_vertical_max: Assumed to be in metres above ground unless geospatial_vertical_units attribute defined otherwise
        time_coverage_start: Format yyyymmddThhmmssZ
        time_coverage_end: Format yyyymmddThhmmssZ
        time_coverage_duration: Should be an ISO8601 duration string, for example P1D
        time_coverage_resolution: Should be an ISO8601 duration string. For L2 data on the original satellite sampling it is acceptable to use 'satellite_orbit_frequency'
        standard_name_vocabulary: The name of the controlled vocabulary from which variable standard names are taken e.g. ‘CF Standard Name Table v82’
        license: Describe the restrictions to data access and distribution
        platform: Satellite name e.g. Sentinel-5. Separate lists by commas and use angled brackets for a platform series, e.g. ‘Envisat, NOAA-<12,14,16,17,18>, Metop-A’. The platform names used should follow the naming in the CCI controlled vocabulary
        sensor: Sensor name e.g. AATSR. Separate lists by commas. The sensor names used should follow the naming in the CCI controlled vocabulary
        spatial_resolution: A free-text string describing the approximate resolution of the product. For example, “1.1km at nadir”. This is intended to provide a useful indication to the user, so if more than one resolution is relevant e.g. the grid resolution and the data resolution, then both can be included.
        geospatial_lat_units: Geospatial latitude units used
        geospatial_lon_units: Geospatial longitude units used
        geospatial_lon_resolution: Geospatial longitude resolution used
        geospatial_lat_resolution: Geospatial latitude resolution used
        key_variables: A comma separated list of the key primary variables in the file i.e. those that have been scientifically validated.
        acknowledgement: Acknowledge funding sources and/or contributors
        publisher_name: The organisation publishing the data
        publisher_url: A URL for the organisation publishing the data
        publisher_email: Contact email address for the organisation publishing the data
        other_attributes: any other attributes to include

    Returns:
        An xarray.Dataset object
    """

    attrs = {}
    self.__extend_attrs(attrs, "title", title, required=True)
    self.__extend_attrs(attrs, "institution", institution, required=True)
    self.__extend_attrs(attrs, "source", source)
    self.__extend_attrs(attrs, "history", history)
    self.__extend_attrs(attrs, "references", references)
    self.__extend_attrs(attrs, "tracking_id", tracking_id, required=True),
    self.__extend_attrs(attrs, "Conventions", Conventions)
    self.__extend_attrs(attrs, "product_version", product_version, required=True)
    self.__extend_attrs(attrs, "format_version", format_version)
    self.__extend_attrs(attrs, "summary", summary)
    self.__extend_attrs(attrs, "keywords", keywords)
    self.__extend_attrs(attrs, "id", id),
    self.__extend_attrs(attrs, "naming_authority", naming_authority),
    self.__extend_attrs(attrs, "keywords_vocabulary", keywords_vocabulary, required=False),
    self.__extend_attrs(attrs, "cdm_data_type", cdm_data_type),
    self.__extend_attrs(attrs, "comment", comment),
    self.__extend_attrs(attrs, "date_created", date_created),
    self.__extend_attrs(attrs, "creator_name", creator_name),
    self.__extend_attrs(attrs, "creator_url", creator_url),
    self.__extend_attrs(attrs, "creator_email", creator_email),
    self.__extend_attrs(attrs, "project", project),
    self.__extend_attrs(attrs, "geospatial_lat_min", geospatial_lat_min),
    self.__extend_attrs(attrs, "geospatial_lat_max", geospatial_lat_max),
    self.__extend_attrs(attrs, "geospatial_lon_min", geospatial_lon_min),
    self.__extend_attrs(attrs, "geospatial_lon_max", geospatial_lon_max),
    self.__extend_attrs(attrs, "geospatial_vertical_min", geospatial_vertical_min),
    self.__extend_attrs(attrs, "geospatial_vertical_max", geospatial_vertical_max),
    self.__extend_attrs(attrs, "time_coverage_start", time_coverage_start),
    self.__extend_attrs(attrs, "time_coverage_end", time_coverage_end),
    self.__extend_attrs(attrs, "time_coverage_duration", time_coverage_duration),
    self.__extend_attrs(attrs, "time_coverage_resolution", time_coverage_resolution),
    self.__extend_attrs(attrs, "standard_name_vocabulary", standard_name_vocabulary),
    self.__extend_attrs(attrs, "license", license),
    self.__extend_attrs(attrs, "platform", platform),
    self.__extend_attrs(attrs, "sensor", sensor),
    self.__extend_attrs(attrs, "spatial_resolution", spatial_resolution),
    self.__extend_attrs(attrs, "geospatial_lat_units", geospatial_lat_units),
    self.__extend_attrs(attrs, "geospatial_lon_units", geospatial_lon_units),
    self.__extend_attrs(attrs, "geospatial_lon_resolution", geospatial_lon_resolution),
    self.__extend_attrs(attrs, "geospatial_lat_resolution", geospatial_lat_resolution),
    self.__extend_attrs(attrs, "key_variables", key_variables)
    self.__extend_attrs(attrs, "acknowledgement", acknowledgement)
    self.__extend_attrs(attrs, "publisher_name", publisher_name),
    self.__extend_attrs(attrs, "publisher_url", publisher_url),
    self.__extend_attrs(attrs, "publisher_email", publisher_email),

    attrs.update(other_attributes)
    ds = xr.Dataset(attrs=attrs)
    # copy the grid definition from the grid file
    copyvars = ["x", "y", "x_bnds", "y_bnds", "crsOSGB"]
    if include_lon_lat:
        copyvars += ["lon", "lat"]
    for copyvar in copyvars:
        ds[copyvar] = self.chuk_grid_ds[copyvar]

    ds = ds.rio.write_crs("EPSG:27700", grid_mapping_name="crsOSGB")

    return ds
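
Any keyword argument not listed above is collected into **other_attributes and stored as an additional global attribute. A minimal sketch, assuming the utils object from the class-level example (my_custom_attribute is a made-up attribute name):

>>> ds = utils.create_new_dataset(
...     title="My CHUK dataset",
...     tracking_id="12345678",
...     product_version="1.0",
...     my_custom_attribute="some value")
>>> ds.attrs["my_custom_attribute"]
'some value'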

get_grid_latlons()

Obtain the chuk grid lats/lons

Returns:

| Type | Description |
|------|-------------|
| `(DataArray, DataArray)` | 2-tuple containing xarray.DataArray objects (lats, lons) |

Source code in eocis_chuk_api/chuk_dataset_utils.py
def get_grid_latlons(self) -> (xarray.DataArray, xarray.DataArray):
    """
    Obtain the chuk grid lats/lons

    Returns:
        2-tuple containing xarray.DataArray objects (lats,lons)
    """
    return (self.chuk_grid_ds.lat, self.chuk_grid_ds.lon)

get_grid_shape()

Obtain the chuk grid shape (y,x)

Returns:

| Type | Description |
|------|-------------|
| `(int, int)` | 2-tuple containing the grid (height, width) |

Source code in eocis_chuk_api/chuk_dataset_utils.py
def get_grid_shape(self) -> (int, int):
    """
    Obtain the chuk grid shape (y,x)

    Returns:
        2-tuple containing the grid (height, width)
    """
    return self.chuk_grid_ds.lat.shape

load(from_path, add_latlon=False, add_latlon_bnds=False)

Load a CHUK dataset from file and return a dataset

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `from_path` | `str` | path to a NetCDF4 file | *required* |
| `add_latlon` | `bool` | add lon and lat 2D arrays to the dataset | `False` |
| `add_latlon_bnds` | `bool` | add lon_bnds and lat_bnds 2D arrays to the dataset | `False` |

Returns:

| Type | Description |
|------|-------------|
| `Dataset` | A dataset containing the loaded CHUK data |

Source code in eocis_chuk_api/chuk_dataset_utils.py
def load(self, from_path: str, add_latlon: bool = False, add_latlon_bnds: bool = False) -> xarray.Dataset:
    """
    Load a CHUK dataset from file and return a dataset

    Args:
        from_path: path to a NetCDF4 file
        add_latlon: add lon and lat 2D arrays to the dataset
        add_latlon_bnds: add lon_bnds and lat_bnds 2D arrays to the dataset

    Returns:
        A dataset containing the loaded CHUK data
    """
    ds = xr.open_dataset(from_path, decode_coords="all")

    self.extend_latlon(ds, add_latlon=add_latlon, add_latlon_bnds=add_latlon_bnds)

    return ds
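
A sketch of loading a previously saved CHUK file and attaching the grid lat/lon arrays, assuming the utils object and the file saved in the class-level example:

>>> ds = utils.load("EOCIS-CHUK-L4-SQUIRRELPOP-MERGED-20231204-v0.1.nc",
...     add_latlon=True, add_latlon_bnds=True)
>>> ds["lat"].shape == utils.get_grid_shape()
True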

sample(ds, to_resolution) staticmethod

Create a lower resolution sample of a CHUK dataset

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `ds` | `Dataset` | the xarray.Dataset containing CHUK data to sample | *required* |
| `to_resolution` | `int` | the resolution for the sampled output, must be a multiple of 100 | *required* |

Returns:

| Type | Description |
|------|-------------|
| `Dataset` | A dataset containing the sampled data |

Source code in eocis_chuk_api/chuk_dataset_utils.py
@staticmethod
def sample(ds: xarray.Dataset, to_resolution: int) -> xarray.Dataset:
    """
    Create a lower resolution sample of a CHUK dataset

    Args:
        ds: the xarray.Dataset containing CHUK data to sample
        to_resolution: the resolution for the sampled output, must be a multiple of 100

    Returns:
        A dataset containing the sampled data
    """
    if to_resolution % 100 != 0:
        raise ValueError(f"Error - resolution requested ({to_resolution}) is not a multiple of 100")
    sample_step = int(to_resolution / 100)
    return ds.isel(x=slice(0, -1, sample_step), y=slice(0, -1, sample_step))
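
A sketch of downsampling a 100m dataset for quick inspection; as a staticmethod, sample can also be called on the class itself:

>>> ds_1km = CHUKDataSetUtils.sample(ds, 1000)  # keeps every 10th cell in x and y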

save(ds, to_path, add_latlon=False, add_latlon_bnds=False, x_chunk_size=1000, y_chunk_size=1000, time_chunk_size=1, custom_encodings={}, override_encodings={})

Save a CHUK dataset to file, applying the standard chunking and compression

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `ds` | `Dataset` | an xarray dataset containing CHUK data | *required* |
| `to_path` | `str` | path to a NetCDF4 file | *required* |
| `add_latlon` | `bool` | add lon and lat 2D arrays to the dataset | `False` |
| `add_latlon_bnds` | `bool` | add lon_bnds and lat_bnds 2D arrays to the dataset | `False` |
| `x_chunk_size` | `int` | size of chunking in the x-dimension | `1000` |
| `y_chunk_size` | `int` | size of chunking in the y-dimension | `1000` |
| `time_chunk_size` | `int` | size of chunking in the time dimension | `1` |
| `custom_encodings` | `dict` | dictionary mapping from variable names to a custom encoding to use by xarray | `{}` |
| `override_encodings` | `dict` | dictionary mapping from variable names to encoding settings that override the defaults (a setting with value None is removed) | `{}` |
Source code in eocis_chuk_api/chuk_dataset_utils.py
def save(self, ds: xarray.Dataset, to_path: str, add_latlon: bool = False, add_latlon_bnds: bool = False,
         x_chunk_size: int = 1000, y_chunk_size: int = 1000,
         time_chunk_size: int = 1, custom_encodings: dict = {}, override_encodings: dict={}):
    """
    Save a CHUK dataset to file, applying the standard chunking and compression

    Args:
        ds: an xarray dataset containing CHUK data
        to_path: path to a NetCDF4 file
        add_latlon: add lon and lat 2D arrays to the dataset
        add_latlon_bnds: add lon_bnds and lat_bnds 2D arrays to the dataset
        x_chunk_size: size of chunking in the x-dimension
        y_chunk_size: size of chunking in the y-dimension
        time_chunk_size: size of chunking in the time dimension
        custom_encodings: dictionary mapping from variable names to a custom encoding to use by xarray
        override_encodings: dictionary mapping from variable names to encoding settings that override the defaults (a setting given the value None is removed)
    """

    encodings = {}

    for v in ds.variables:
        if custom_encodings and v in custom_encodings:
            encodings[v] = custom_encodings[v]
        else:
            dims = ds[v].dims
            if "x" in dims and "y" in dims:

                encodings[v] = {
                    "zlib": True,
                    "complevel": 5
                }

                if v in override_encodings:
                    for (name,value) in override_encodings[v].items():
                        if value is None:
                            # remove the default setting when the override value is None
                            if name in encodings[v]:
                                del encodings[v][name]
                        else:
                            encodings[v][name] = value

                chunk_sizes = []
                for d in dims:
                    if d == "y":
                        chunk_sizes.append(y_chunk_size)
                    elif d == "x":
                        chunk_sizes.append(x_chunk_size)
                    elif d == "time":
                        chunk_sizes.append(time_chunk_size)
                    else:
                        chunk_sizes.append(0)
                encodings[v]["chunksizes"] = chunk_sizes

    self.extend_latlon(ds, add_latlon=add_latlon, add_latlon_bnds=add_latlon_bnds)


    # ds = ds.rio.write_crs("EPSG:27700",grid_mapping_name="crsOSGB")

    ds.to_netcdf(to_path, encoding=encodings)
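
A sketch of a save call that overrides the default compression for one variable; chuk_ds and the variable name reuse the squirrel example at the top of this page and are otherwise assumptions. Per the code above, giving a setting the value None in override_encodings removes it from the default encoding:

>>> utils.save(chuk_ds, "output.nc",
     x_chunk_size=500, y_chunk_size=500,
     override_encodings={"squirrel_population": {"complevel": 9}})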

save_as_geotif(ds, variable_name, to_path) staticmethod

Save a CHUK dataset to a geotiff. DEPRECATED - use save_as_geotiff

Parameters:

Name             Type       Description                                           Default
ds               Dataset    the CHUK dataset                                      required
variable_name    str        the name of the variable to save from the dataset    required
to_path          str        the path to save the geotiff file to                  required
Source code in eocis_chuk_api/chuk_dataset_utils.py
@staticmethod
def save_as_geotif(ds: xarray.Dataset, variable_name: str, to_path: str):
    """
    Save a CHUK dataset to a geotiff.  DEPRECATED - use save_as_geotiff

    Args:
        ds: the CHUK dataset
        variable_name: the name of the variable to save from the dataset
        to_path: the path to save the geotiff file to
    """
    return CHUKDataSetUtils.save_as_geotiff(ds, variable_name, to_path)

save_as_geotiff(ds, variable_name, to_path) staticmethod

Save a CHUK dataset to a geotiff

Parameters:

Name             Type       Description                                           Default
ds               Dataset    the CHUK dataset                                      required
variable_name    str        the name of the variable to save from the dataset    required
to_path          str        the path to save the geotiff file to                  required
Source code in eocis_chuk_api/chuk_dataset_utils.py
@staticmethod
def save_as_geotiff(ds: xarray.Dataset, variable_name: str, to_path: str):
    """
    Save a CHUK dataset to a geotiff

    Args:
        ds: the CHUK dataset
        variable_name: the name of the variable to save from the dataset
        to_path: the path to save the geotiff file to
    """
    ds_crs = ds.rio.write_crs("EPSG:27700")
    if "grid_mapping" in ds_crs[variable_name].attrs:
        # this seems to cause a problem, why?
        del ds_crs[variable_name].attrs["grid_mapping"]
    tags = CHUKMetadata.to_json(ds_crs, variable_name)
    ds_crs[variable_name].rio.to_raster(to_path, tags=tags, driver="COG")
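
For example (the variable and output names follow the squirrel example and are assumptions), a single variable can be exported as a cloud-optimised GeoTIFF, since the raster is written with driver="COG":

>>> CHUKDataSetUtils.save_as_geotiff(chuk_ds, "squirrel_population", "EOCIS-CHUK-L4-SQUIRRELPOP.tif")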

Working with CHUK Auxiliary data

CHUKAuxilaryDataCombinedMask

Bases: Mask

Source code in eocis_chuk_api/chuk_auxilary_utils.py
class CHUKAuxilaryDataCombinedMask(Mask):

    def __init__(self, *masks: Mask, operator: str = "or"):
        """
        Create a mask derived from one or more other masks

        Args:
            masks: a list of one or more masks to be combined
            operator: the operator to use, should be "not", "or" or "and"

        Throws:
            ValueError: if the list of masks is empty, if operator is not one of "not", "and" or "or", or if more than one mask is supplied with the "not" operator
        """
        if len(masks) == 0:
            raise ValueError("masks must be a non-empty list")
        if operator not in ("and","or","not"):
            raise ValueError('operator must be one of "and", "or" or "not"')
        if operator == "not" and len(masks) > 1:
            raise ValueError("only one mask can be supplied for the not operator")
        self.input_masks = masks
        self.operator = operator

    def to_array(self):

        if self.operator == "not":
            m = self.input_masks[0].to_array()
            return xr.where(m,False,True)

        stacked = xr.concat([m.to_array() for m in self.input_masks], "layer")

        if self.operator == "or":
            return stacked.any(dim="layer")
        elif self.operator == "and":
            return stacked.all(dim="layer")
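
In practice a combined mask is normally obtained via the or_mask, and_mask and not_mask methods on Mask rather than constructed directly. A minimal sketch, assuming urban_mask and water_mask are CHUKAuxilaryDataMask instances built elsewhere:

>>> combined = CHUKAuxilaryDataCombinedMask(urban_mask, water_mask, operator="or")
>>> arr = combined.to_array()   # True wherever either input mask is True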

__init__(*masks, operator='or')

Create a mask derived from one or more other masks

Parameters:

Name        Type    Description                                             Default
masks       Mask    one or more masks to be combined                        ()
operator    str     the operator to use, should be "not", "or" or "and"    'or'

Throws:

ValueError if the list of masks is empty, if operator is not one of "not", "and" or "or", or if more than one mask is supplied with the "not" operator

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def __init__(self, *masks: Mask, operator: str = "or"):
    """
    Create a mask derived from one or more other masks

    Args:
        masks: a list of one or more masks to be combined
        operator: the operator to use, should be "not", "or" or "and"

    Throws:
        ValueError: if the list of masks is empty, if operator is not one of "not", "and" or "or", or if more than one mask is supplied with the "not" operator
    """
    if len(masks) == 0:
        raise ValueError("masks must be a non-empty list")
    if operator not in ("and","or","not"):
        raise ValueError('operator must be one of "and", "or" or "not"')
    if operator == "not" and len(masks) > 1:
        raise ValueError("only one mask can be supplied for the not operator")
    self.input_masks = masks
    self.operator = operator

CHUKAuxilaryDataMask

Bases: Mask

Source code in eocis_chuk_api/chuk_auxilary_utils.py
class CHUKAuxilaryDataMask(Mask):

    def __init__(self, dataset_name:str, variable_name:str, include_missing:bool=False):
        """
        Construct a mask associated with a particular dataset

        Args:
            dataset_name: the name of the dataset
            variable_name: the name of the variable in the dataset to use to construct the mask
            include_missing: whether to also include missing data values (e.g. NaN) in the mask
        """
        self.dataset_name = dataset_name
        self.variable_name = variable_name
        self.da = xr.open_dataset(dataset_name)[variable_name]
        meanings = self.da.attrs["flag_meanings"].split(" ")
        values = self.da.attrs["flag_values"]
        self.value_lookup = {}
        for (meaning, value) in zip(meanings, values):
            self.value_lookup[meaning] = value
        self.mask_values = []
        self.cached_result = None
        self.include_missing = include_missing

    def get_all_mask_values(self) -> list[str]:
        """
        Get a list of all the values that could be included in this mask

        Returns:
             a list of values
        """
        return list(self.value_lookup.keys())

    def get_selected_mask_values(self) -> list[str]:
        """
        Get a list of all the values that are included in this mask

        Returns:
             a list of values included in this mask
        """
        keys = []
        for mask_value in self.mask_values:
            keys += self.__get_matching_keys(mask_value)
        return keys

    def add_mask_value(self, mask_value: str):
        """
        Add a value to the mask

        Args:
            mask_value: the category value to include in the mask
        Throws:
            ValueError if the specified value is not a valid value for this mask
        """
        matching_keys = self.__get_matching_keys(mask_value)
        if len(matching_keys) == 0:
            raise ValueError(f"Value {mask_value} does not match any values {','.join(self.value_lookup.keys())}")
        self.cached_result = None
        self.mask_values.append(mask_value)
        return matching_keys

    def to_array(self) -> xr.DataArray:
        """
        Obtain the evaluated mask values
        Returns:
            an xarray DataArray object
        """
        if self.cached_result is None:
            filter_keys = []
            for mask_value in self.mask_values:
                filter_keys += self.__get_matching_keys(mask_value)
            filter_values = [self.value_lookup[key] for key in filter_keys]
            self.cached_result = xr.where(self.da.isin(filter_values), True, False)
            if self.include_missing:
                self.cached_result = xr.where(np.isnan(self.da),True,self.cached_result)
        return self.cached_result

    def __get_matching_keys(self, value_or_pattern):
        if value_or_pattern in self.value_lookup:
            return [value_or_pattern]
        matches = []
        for key in self.value_lookup:
            if fnmatch.fnmatch(key, value_or_pattern):
                matches.append(key)
        return matches
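
A usage sketch; the file and variable names are hypothetical, and the variable is expected to carry CF flag_meanings and flag_values attributes:

>>> mask = CHUKAuxilaryDataMask("EOCIS-CHUK-LANDCOVER.nc", "land_cover_class")
>>> print(mask.get_all_mask_values())
>>> # values can be named exactly or matched as fnmatch wildcard patterns
>>> mask.add_mask_value("urban*")
>>> arr = mask.to_array()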

__init__(dataset_name, variable_name, include_missing=False)

Construct a mask associated with a particular dataset

Parameters:

Name               Type    Description                                                              Default
dataset_name       str     the name of the dataset                                                  required
variable_name      str     the name of the variable in the dataset to use to construct the mask    required
include_missing    bool    whether to also include missing data values (e.g. NaN) in the mask      False
Source code in eocis_chuk_api/chuk_auxilary_utils.py
def __init__(self, dataset_name:str, variable_name:str, include_missing:bool=False):
    """
    Construct a mask associated with a particular dataset

    Args:
        dataset_name: the name of the dataset
        variable_name: the name of the variable in the dataset to use to construct the mask
        include_missing: whether to also include missing data values (e.g. NaN) in the mask
    """
    self.dataset_name = dataset_name
    self.variable_name = variable_name
    self.da = xr.open_dataset(dataset_name)[variable_name]
    meanings = self.da.attrs["flag_meanings"].split(" ")
    values = self.da.attrs["flag_values"]
    self.value_lookup = {}
    for (meaning, value) in zip(meanings, values):
        self.value_lookup[meaning] = value
    self.mask_values = []
    self.cached_result = None
    self.include_missing = include_missing

add_mask_value(mask_value)

Add a value to the mask

Parameters:

Name          Type    Description                                  Default
mask_value    str     the category value to include in the mask    required

Throws:

ValueError if the specified value is not a valid value for this mask

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def add_mask_value(self, mask_value: str):
    """
    Add a value to the mask

    Args:
        mask_value: the category value to include in the mask
    Throws:
        ValueError if the specified value is not a valid value for this mask
    """
    matching_keys = self.__get_matching_keys(mask_value)
    if len(matching_keys) == 0:
        raise ValueError(f"Value {mask_value} does not match any values {','.join(self.value_lookup.keys())}")
    self.cached_result = None
    self.mask_values.append(mask_value)
    return matching_keys

get_all_mask_values()

Get a list of all the values that could be included in this mask

Returns:

Type         Description
list[str]    a list of values

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def get_all_mask_values(self) -> list[str]:
    """
    Get a list of all the values that could be included in this mask

    Returns:
         a list of values
    """
    return list(self.value_lookup.keys())

get_selected_mask_values()

Get a list of all the values that are included in this mask

Returns:

Type         Description
list[str]    a list of values included in this mask

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def get_selected_mask_values(self) -> list[str]:
    """
    Get a list of all the values that are included in this mask

    Returns:
         a list of values included in this mask
    """
    keys = []
    for mask_value in self.mask_values:
        keys += self.__get_matching_keys(mask_value)
    return keys

to_array()

Obtain the evaluated mask values

Returns:

Type         Description
DataArray    an xarray DataArray object

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def to_array(self) -> xr.DataArray:
    """
    Obtain the evaluated mask values
    Returns:
        an xarray DataArray object
    """
    if self.cached_result is None:
        filter_keys = []
        for mask_value in self.mask_values:
            filter_keys += self.__get_matching_keys(mask_value)
        filter_values = [self.value_lookup[key] for key in filter_keys]
        self.cached_result = xr.where(self.da.isin(filter_values), True, False)
        if self.include_missing:
            self.cached_result = xr.where(np.isnan(self.da),True,self.cached_result)
    return self.cached_result

CHUKAuxilaryUtils

Source code in eocis_chuk_api/chuk_auxilary_utils.py
class CHUKAuxilaryUtils:

    @staticmethod
    def create_mask(dataset_path: str, variable: str, mask_values: str | list[str], include_missing: bool = False) -> CHUKAuxilaryDataMask:
        """
        Create a mask

        Args:
            dataset_path: path to the NetCDF file containing the auxiliary data to use
            variable: the variable in the file to use in the mask
            mask_values: a string or list of strings
            include_missing: whether to include missing data values in the mask or not

        Returns:
            A mask object containing True or False values for every cell
        """
        mask = CHUKAuxilaryDataMask(dataset_path, variable, include_missing=include_missing)
        if isinstance(mask_values, str):
            mask_values = [mask_values]
        for mask_value in mask_values:
            mask.add_mask_value(mask_value)
        return mask

    """
    Construct the logical AND of a list of masks

    Args:
        masks: the masks to combine

    Returns:
        Resulting mask
    """
    @staticmethod
    def combine_masks_and(*masks:list[Mask]):
        return masks[0].and_mask(*masks[1:])

    """
    Construct the logical OR of a list of masks

    Args:
        masks: the masks to combine

    Returns:
        Resulting mask
    """
    @staticmethod
    def combine_masks_or(*masks:list[Mask]):
        return masks[0].or_mask(*masks[1:])

    """
    Construct the logical NOT of a mask

    Args:
        masks: the masks to combine

    Returns:
        Resulting mask
    """
    @staticmethod
    def not_mask(mask:Mask):
        return mask.not_mask()

create_mask(dataset_path, variable, mask_values, include_missing=False) staticmethod

Create a mask

Parameters:

Name               Type               Description                                                      Default
dataset_path       str                path to the NetCDF file containing the auxiliary data to use    required
variable           str                the variable in the file to use in the mask                     required
mask_values        str | list[str]    a string or list of strings                                     required
include_missing    bool               whether to include missing data values in the mask or not       False

Returns:

Type                    Description
CHUKAuxilaryDataMask    A mask object containing True or False values for every cell

Source code in eocis_chuk_api/chuk_auxilary_utils.py
@staticmethod
def create_mask(dataset_path: str, variable: str, mask_values: str | list[str], include_missing: bool = False) -> CHUKAuxilaryDataMask:
    """
    Create a mask

    Args:
        dataset_path: path to the NetCDF file containing the auxiliary data to use
        variable: the variable in the file to use in the mask
        mask_values: a string or list of strings
        include_missing: whether to include missing data values in the mask or not

    Returns:
        A mask object containing True or False values for every cell
    """
    mask = CHUKAuxilaryDataMask(dataset_path, variable, include_missing=include_missing)
    if isinstance(mask_values, str):
        mask_values = [mask_values]
    for mask_value in mask_values:
        mask.add_mask_value(mask_value)
    return mask
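
A sketch combining two masks built from the same auxiliary file (the file and variable names here are hypothetical):

>>> woodland = CHUKAuxilaryUtils.create_mask("EOCIS-CHUK-LANDCOVER.nc", "land_cover_class", "woodland*")
>>> urban = CHUKAuxilaryUtils.create_mask("EOCIS-CHUK-LANDCOVER.nc", "land_cover_class", "urban")
>>> either = CHUKAuxilaryUtils.combine_masks_or(woodland, urban)
>>> print(either.count())   # number of cells that are woodland or urban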

Mask

Bases: ABC

Source code in eocis_chuk_api/chuk_auxilary_utils.py
class Mask(abc.ABC):

    def __init__(self):
        """
        Abstract Base Class for masks, do not instantiate directly
        """
        pass

    def or_mask(self, *others: "Mask") -> "Mask":
        """
        OR this mask with other masks

        Args:
            others: a list of masks to be OR'd with this mask
        Returns:
            a combined mask
        """
        return CHUKAuxilaryDataCombinedMask(self, *others, operator="or")

    def and_mask(self, *others: "Mask") -> "Mask":
        """
        AND this mask with other masks

        Args:
            others: a list of masks to be AND'd with this mask
        Returns:
            a combined mask
        """
        return CHUKAuxilaryDataCombinedMask(self, *others, operator="and")


    def not_mask(self) -> "Mask":
        """
        Invert this mask

        Returns:
             a new mask that is the negation of this mask
        """
        return CHUKAuxilaryDataCombinedMask(self, operator="not")

    def count(self) -> int:
        """
        Count the number of True values in this mask

        Returns:
            the total number of True values
        """
        return int(self.to_array().sum())

    def fraction(self) -> float:
        """
        Calculate the fraction of values that are True in this mask

        Returns:
            the fraction of values that are True
        """
        m = self.to_array()
        # divide the number of True cells by the total number of cells
        return int(m.sum()) / m.size

    @abc.abstractmethod
    def to_array(self) -> xr.DataArray:
        """
        Convert this mask to an xarray.DataArray and return it

        Returns:
             an xarray.DataArray containing the mask values
        """
        pass # implemented in sub-classes
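
For any concrete mask (here mask is assumed to be an instance of one of the subclasses above), count and fraction give a quick summary of coverage:

>>> n = mask.count()       # number of True cells
>>> f = mask.fraction()    # proportion of all cells that are True
>>> print(f"{n} cells selected ({100 * f:.1f}% of the grid)")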

__init__()

Abstract Base Class for masks, do not instantiate directly

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def __init__(self):
    """
    Abstract Base Class for masks, do not instantiate directly
    """
    pass

and_mask(*others)

AND this mask with other masks

Parameters:

Name      Type    Description                                   Default
others    Mask    a list of masks to be AND'd with this mask    ()

Returns:

Type    Description
Mask    a combined mask

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def and_mask(self, *others: "Mask") -> "Mask":
    """
    AND this mask with other masks

    Args:
        others: a list of masks to be AND'd with this mask
    Returns:
        a combined mask
    """
    return CHUKAuxilaryDataCombinedMask(self, *others, operator="and")

count()

Count the number of True values in this mask

Returns:

Type    Description
int     the total number of True values

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def count(self) -> int:
    """
    Count the number of True values in this mask

    Returns:
        the total number of True values
    """
    return int(self.to_array().sum())

fraction()

Calculate the fraction of values that are True in this mask

Returns:

Type     Description
float    the fraction of values that are True

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def fraction(self) -> float:
    """
    Calculate the fraction of values that are True in this mask

    Returns:
        the fraction of values that are True
    """
    m = self.to_array()
    # divide the number of True cells by the total number of cells
    return int(m.sum()) / m.size

not_mask()

Invert this mask

Returns:

Type    Description
Mask    a new mask that is the negation of this mask

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def not_mask(self) -> "Mask":
    """
    Invert this mask

    Returns:
         a new mask that is the negation of this mask
    """
    return CHUKAuxilaryDataCombinedMask(self, operator="not")

or_mask(*others)

OR this mask with other masks

Parameters:

Name      Type    Description                                  Default
others    Mask    a list of masks to be OR'd with this mask    ()

Returns:

Type    Description
Mask    a combined mask

Source code in eocis_chuk_api/chuk_auxilary_utils.py
def or_mask(self, *others: "Mask") -> "Mask":
    """
    OR this mask with other masks

    Args:
        others: a list of masks to be OR'd with this mask
    Returns:
        a combined mask
    """
    return CHUKAuxilaryDataCombinedMask(self, *others, operator="or")

to_array() abstractmethod

Convert this mask to an xarray.DataArray and return it

Returns:

Type         Description
DataArray    an xarray.DataArray containing the mask values

Source code in eocis_chuk_api/chuk_auxilary_utils.py
@abc.abstractmethod
def to_array(self) -> xr.DataArray:
    """
    Convert this mask to an xarray.DataArray and return it

    Returns:
         an xarray.DataArray containing the mask values
    """
    pass # implemented in sub-classes