#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This module defines a rastertool named zonalstats that compute several statistics
(mean, median, std dev, etc) on one or several bands of a raster image. The statistics
can be computed on the whole image or by zones defined by a vector file (e.g. shapefile,
geojson).
Several options are provided:
* compute outliers: enable to generate an image that emphasizes the outliers pixels
(i.e. pixels with values greater that mean + n x stddev where n can be parametrized).
* generate a chart: if several raster images are computed and if we can extract a date from the
filenames (because the raster is of a known type), generate one chart
per statistics (x=time, y=stats)
"""
from typing import List, Dict
import datetime
import logging.config
from pathlib import Path
import json
import numpy as np
import geopandas as gpd
import sys
import rasterio
from eolab.georastertools import utils
from eolab.georastertools import Rastertool, RastertoolConfigurationException
from eolab.georastertools.processing import compute_zonal_stats, compute_zonal_stats_per_category
from eolab.georastertools.processing import extract_zonal_outliers, plot_stats
from eolab.georastertools.processing import vector
from eolab.georastertools.product import RasterProduct
_logger = logging.getLogger(__name__)
[docs]class Zonalstats(Rastertool):
"""
Raster tool that computes zonal statistics of a raster product.
"""
supported_output_formats = {
'ESRI Shapefile': 'shp',
'GeoJSON': 'geojson',
'CSV': 'csv',
'GPKG': 'gpkg',
'GML': 'gml'
}
"""List of all possible output format are provided by fiona.supported_drivers.keys()"""
valid_stats = ['count', 'valid', 'nodata',
'min', 'max', 'mean', 'std', 'sum', 'median', "mad", 'range',
'majority', 'minority', 'unique']
"""List of stats that can be computed. In addition to this list, "percentile_xx" is
also a valid stat where xx is the percentile value, e.g. percentile_70"""
[docs] def __init__(self, stats: List[str], categorical: bool = False, valid_threshold: float = 0.0,
area: bool = False, prefix: str = None, bands: List[int] = [1]):
""" Constructor
Args:
stats ([str]):
List of stats to compute. Zonalstats.valid_stats defined the list of valid stats
except percentile which can be defined as string concatenating percentile\\_ with
the percentile value (from 0 to 100).
categorical (bool, optional, default=False):
If true and input raster is "categorical", add the counts of every unique
raster values to the stats
valid_threshold (float, optional, default=0.0):
Minimum percentage of valid pixels in a shape to compute its statistics ([0.0, 1.0])
area (bool, optional, default=False):
If true, statistics are multiplied by the pixel area of the raster input
prefix (str, optional, default=None]):
Add a prefix to the stats keys, one prefix per band. The argument is a string
with all prefixes separated by a space.
bands ([int], optional, default=[1]):
List of bands in the input image to process.
Set None if all bands shall be processed.
"""
super().__init__()
self._stats = stats
self._categorical = categorical
value = valid_threshold or 0.0
if value < 0.0 or value > 1.0:
raise RastertoolConfigurationException(
f"Valid threshold must be in range [0.0, 1.0].")
if value > 1e-5 and "valid" not in stats:
raise RastertoolConfigurationException(
"Cannot apply a valid threshold when the computation "
"of the valid stat has not been requested.")
self._valid_threshold = value
self._area = area
self._prefix = prefix.split() if prefix else None
self.__check_stats()
self._bands = bands
self._output_format = "ESRI Shapefile"
self._geometries = None
self._within = False
self._sigma = None
self._chart_file = None
self._geometry_index = 'ID'
self._display_chart = False
self._category_file = None
self._category_file_type = None
self._category_index = None
self._category_labels = None
self._generated_stats = list()
self._generated_stats_dates = list()
@property
def generated_stats_per_date(self):
"""After processing one or several files, this method enables to retrieve a dictionary
that contains the statistics for each inputfile's date:
- keys are timestamps
- values are the statistics at the corresponding timestam
Warning:
When the timestamp of the input raster cannot be retrieved, the dictionary does not
contain the generated statistics for this input raster. In this case, prefer calling
generated_stats to get the stats as a list (one item per input file).
"""
out = dict()
if len(self._generated_stats_dates) > 0:
out = {date: stats
for (date, stats) in zip(self.generated_stats_dates, self.generated_stats)}
return out
@property
def generated_stats(self):
"""The list of generated stats in the same order as the input files"""
return self._generated_stats
@property
def generated_stats_dates(self):
"""The list of dates when they can be extracted from the input files' names"""
return self._generated_stats_dates
@property
def stats(self) -> List[str]:
"""List of stats to compute"""
return self._stats
@property
def categorical(self) -> bool:
"""Whether to compute the counts of every unique pixel values"""
return self._categorical
@property
def valid_threshold(self) -> float:
"""Minimum percentage of valid pixels in a shape to compute its statistics"""
return self._valid_threshold
@property
def area(self) -> bool:
"""Whether to compute the statistics multiplied by the pixel area"""
return self._area
@property
def prefix(self) -> str:
"""Prefix of the features stats (one per band)"""
return self._prefix
@property
def bands(self) -> List[int]:
"""List of bands to process"""
return self._bands
@property
def output_format(self) -> str:
"""Output format for the features stats"""
return self._output_format
@property
def geometries(self) -> str:
"""The geometries where to compute zonal statistics"""
return self._geometries
@property
def within(self) -> bool:
"""Whether to compute stats for geometries within the raster (if False, stats for
all geometries intersecting the raster are computed)"""
return self._within
@property
def sigma(self) -> float:
"""Number of sigmas for identifying outliers"""
return self._sigma
@property
def chart_file(self) -> str:
"""Name of the chart file to generate"""
return self._chart_file
@property
def geometry_index(self) -> str:
"""The column name identifying the name of the geometry"""
return self._geometry_index
@property
def display_chart(self) -> bool:
"""Whether to display the chart"""
return self._display_chart
@property
def category_file(self) -> str:
"""Filename containing the categories when computing stats per
categories in the geometries"""
return self._category_file
@property
def category_file_type(self) -> str:
"""Type of the category file, either 'raster' or 'vector'"""
return self._category_file_type
@property
def category_index(self) -> str:
"""Column name identifying categories in categroy_file (only if file format
is geometries)
"""
return self._category_index
@property
def category_labels(self) -> str:
"""Dict with classes index as keys and names to display as values"""
return self._category_labels
def __check_stats(self):
"""Check that the requested stats are valid.
Args:
stats_to_compute ([str]):
List of stats to compute
"""
for x in self._stats:
# check percentile format
if x.startswith("percentile_"):
q = float(x.replace("percentile_", ''))
# percentile must be in range [0, 100]
if q > 100.0:
raise RastertoolConfigurationException('percentiles must be <= 100')
if q < 0.0:
raise RastertoolConfigurationException('percentiles must be >= 0')
elif x not in Zonalstats.valid_stats:
raise RastertoolConfigurationException(
f"Invalid stat {x}: must be "
f"percentile_xxx or one of {Zonalstats.valid_stats}")
[docs] def with_output(self, outputdir: str = ".", output_format: str = "ESRI Shapefile"):
"""Set up the output.
Args:
outputdir (str, optional, default="."):
Output dir where to store results. If none, results are not dumped to a file.
output_format (str, optional, default="ESRI Shapefile"):
Format of the output 'ESRI Shapefile', 'GeoJSON', 'CSV', 'GPKG', 'GML'
(see supported_output_formats). If None, it is set to ESRI Shapefile
Returns:
:obj:`eolab.georastertools.Zonalstats`: the current instance so that it is
possible to chain the with... calls (fluent API)
"""
super().with_output(outputdir)
self._output_format = output_format or 'ESRI Shapefile'
# check if output_format exists
if self._output_format not in Zonalstats.supported_output_formats:
raise RastertoolConfigurationException(
f"Unrecognized output format {output_format}. "
f"Possible values are {', '.join(Zonalstats.supported_output_formats)}")
return self
[docs] def with_geometries(self, geometries: str, within: bool = False):
"""Set up the geometries where to compute stats.
Args:
geometries (str):
Name of the file containing the geometries where to compute zonal stats. If not set,
stats are computed on the whole raster image
within (bool, optional, default=False):
Whether to compute stats only for geometries within the raster. If False,
statistics are computed for geometries that intersect the raster shape.
Returns:
:obj:`eolab.georastertools.Zonalstats`: the current instance so that it is
possible to chain the with... calls (fluent API)
"""
self._geometries = geometries
self._within = within
return self
[docs] def with_outliers(self, sigma: float):
"""Set up the computation of outliers
Args:
sigma (float):
Distance to the mean value to consider a pixel as an outlier (expressed
in sigma, e.g. the value 2 means that pixels values greater than
mean value + 2 * std are outliers)
Returns:
:obj:`eolab.georastertools.Zonalstats`: the current instance so that it is
possible to chain the with... calls (fluent API)
"""
# Manage sigma computation option that requires mean + std dev computation
if "mean" not in self._stats:
self._stats.append("mean")
if "std" not in self._stats:
self._stats.append("std")
self._sigma = sigma
return self
[docs] def with_chart(self, chart_file: str = None, geometry_index: str = 'ID', display: bool = False):
"""Set up the charting capability
Args:
chart_file (str, optional, default=None):
If not None, generate a chart with the statistics and saves it to str
geometry_index (str, optional, default='ID'):
Name of the index in the geometry file
display (bool, optional, default=False):
If true, display the chart with the statistics
Returns:
:obj:`eolab.georastertools.Zonalstats`: the current instance so that it is
possible to chain the with... calls (fluent API)
"""
self._chart_file = chart_file
self._geometry_index = geometry_index
self._display_chart = display
return self
[docs] def with_per_category(self, category_file: str, category_index: str = 'Classe',
category_labels_json: str = None):
"""Set up the zonal stats computation per categories
Args:
category_file (str):
Name of the file containing the categories
category_index (str, optional, default='Classe'):
Name of column containing the category value (if category_file is a vector)
category_labels_json (str, optional, default=None):
Name of json file containing the dict that associates category values
to category names
Returns:
:obj:`eolab.georastertools.Zonalstats`: the current instance so that it is
possible to chain the with... calls (fluent API)
"""
self._category_file = category_file
self._category_index = category_index
# get the category file type
if category_file:
suffix = utils.get_suffixes(category_file)
if suffix[1:] in Zonalstats.supported_output_formats.values():
self._category_file_type = "vector"
else:
# not a vector, maybe a raster? try to open it with rasterio
try:
rasterio.open(category_file)
except IOError:
raise RastertoolConfigurationException(
f"File {category_file} cannot be read: check format and existence")
self._category_file_type = "raster"
# get the dict of category labels
if category_labels_json:
try:
with open(category_labels_json) as f:
self._category_labels = json.load(f)
except Exception as err:
raise RastertoolConfigurationException(
f"File {category_labels_json} does not contain a valid dict.") from err
return self
[docs] def process_file(self, inputfile: str) -> List[str]:
"""Compute the stats for a single input file
Args:
inputfile (str):
Input image to process
Returns:
[str]: List of generated statistical images (posix paths) that have been generated
"""
_logger.info(f"Processing file {inputfile}")
# STEP 1: Prepare the input image so that it can be processed
_logger.info("Prepare the input for computation")
with RasterProduct(inputfile, vrt_outputdir=self.vrt_dir) as product:
if product.rastertype is None and self.chart_file:
_logger.error("Unrecognized raster type of input file,"
" cannot extract date for plotting")
# open raster to get metadata
raster = product.get_raster()
rst = rasterio.open(raster)
bound = int(rst.count)
indexes = rst.indexes
descr = rst.descriptions
geotransform = rst.get_transform()
width = np.abs(geotransform[1])
height = np.abs(geotransform[5])
area_square_meter = width * height
rst.close()
date_str = product.get_date_string('%Y%m%d-%H%M%S')
# check band index and handle all bands options (when bands is None)
if self.bands is None or len(self.bands) == 0:
bands = indexes
else:
bands = self.bands
if min(bands) < 1 or max(bands) > bound:
raise ValueError(f"Invalid bands, all values are not in range [1, {bound}]")
# check the prefix
if self.prefix and len(self.prefix) != len(bands):
raise ValueError("Number of prefix does not equal the number of bands.")
# STEP 2: Prepare the geometries where to compute zonal stats
if self.geometries:
# reproject & filter input geometries to fit the raster extent
geometries = vector.reproject(
vector.filter(self.geometries, raster, self.within), raster)
else:
# if no geometry is defined, get the geometry from raster shape
geometries = vector.get_raster_shape(raster)
# STEP 3: Compute the statistics
geom_stats = self.compute_stats(raster, bands, geometries,
descr, date_str, area_square_meter)
self._generated_stats.append(geom_stats)
if date_str:
timestamp = datetime.datetime.strptime(date_str, '%Y%m%d-%H%M%S')
self._generated_stats_dates.append(timestamp)
# STEP 4: Generate outputs
outputs = []
if self.outputdir:
outdir = Path(self.outputdir)
ext = Zonalstats.supported_output_formats[self.output_format]
outputname = f"{utils.get_basename(inputfile)}-stats.{ext}"
outputfile = outdir.joinpath(outputname)
geom_stats.to_file(outputfile.as_posix(), driver=self.output_format)
outputs.append(outputfile.as_posix())
# if sigma is not None, generate the outliers image
if self.sigma:
_logger.info("Extract outliers")
outliersfile = outdir.joinpath(
f"{utils.get_basename(inputfile)}-stats-outliers.tif")
extract_zonal_outliers(geom_stats, raster, outliersfile.as_posix(),
prefix=self.prefix or [""] * len(bands),
bands=bands, sigma=self.sigma)
outputs.append(outliersfile.as_posix())
return outputs
[docs] def postprocess_files(self, inputfiles: List[str], outputfiles: List[str]) -> List[str]:
"""Generate the chart if requested after computing stats for each input file
Args:
inputfiles ([str]): Input images to process
outputfiles ([str]): List of generated files after executing the
rastertool on the input files
Returns:
[str]: A list containing the chart file if requested
"""
additional_outputs = []
if self.chart_file and len(self.generated_stats_per_date) > 0:
_logger.info("Generating chart")
plot_stats(self.chart_file, self.generated_stats_per_date,
self.stats, self.geometry_index, self.display_chart)
additional_outputs.append(self.chart_file)
return additional_outputs
[docs] def compute_stats(self, raster: str, bands: List[int],
geometries: gpd.GeoDataFrame,
descr: List[str], date: str,
area_square_meter: int) -> List[List[Dict[str, float]]]:
"""Compute the statistics of the input data. [Minimum, Maximum, Mean, Standard deviation]
Args:
raster (str):
Input image to process
bands ([int]):
List of bands in the input image to process. Empty list means all bands
geometries (GeoDataFrame):
Geometries where to add statistics (geometries must be in the same
projection as the raster)
descr ([str]):
Band descriptions
date (str):
Timestamp of the input raster
area_square_meter (int):
Area represented by a pixel
Returns:
list[list[dict]]
The dictionnary associates the name of the statistics to its value.
"""
_logger.info("Compute statistics")
# Compute zonal statistics
if self.category_file is not None:
# prepare the categories data
if self.category_file_type == "vector":
# clip categories to the raster bounds and reproject in the raster crs
class_geom = vector.reproject(
vector.clip(self.category_file, raster),
raster)
else: # filetype is raster
# vectorize the raster and reproject in the raster crs
class_geom = vector.reproject(
vector.vectorize(self.category_file, raster, self.category_index),
raster)
# compute the statistics per category
statistics = compute_zonal_stats_per_category(
geometries, raster,
bands=bands,
stats=self.stats,
categories=class_geom,
category_index=self.category_index,
category_labels=self.category_labels)
else:
statistics = compute_zonal_stats(
geometries, raster,
bands=bands,
stats=self.stats,
categorical=self.categorical)
# apply area
if self.area:
[d.update({key: area_square_meter * val})
for s in statistics
for d in s for key, val in d.items() if not np.isnan(val)]
# convert statistics to GeoDataFrame
geom_stats = self.__stats_to_geoms(statistics, geometries, bands, descr, date)
return geom_stats
def __stats_to_geoms(self, statistics_data: List[List[Dict[str, float]]],
geometries: gpd.GeoDataFrame,
bands: List[int], descr: List[str], date: str) -> gpd.GeoDataFrame:
"""Appends statistics to the geodataframe.
Args:
statistics_data:
A list of list of dictionnaries. Dict associates the stat names and the stat values.
geometries (GeoDataFrame):
Geometries where to add statistics
bands ([int]):
List of bands in the input image to process. Empty list means all bands
descr ([str]):
Bands descriptions to add to global metadata
date (str):
Date of raster to add to global metadata
Returns:
GeoDataFrame: The updated geometries with statistics saved in metadata of
the following form: b{band_number}.{metadata_name} where metadata_name is
successively the band name, the date and the statistics names (min, mean, max, median, std)
"""
prefix = self.prefix or [""] * len(bands)
for i, band in enumerate(bands):
# add general metadata to geometries
if descr and descr[i]:
geometries[utils.get_metadata_name(band, prefix[i], "name")] = descr[i]
if date:
geometries[utils.get_metadata_name(band, prefix[i], "date")] = date
# get all statistics names since additional statistics coming from categorical
# option may have been computed
stats = self.stats.copy()
categorical_stats = set()
[categorical_stats.update(s[i].keys()) for s in statistics_data]
if self.category_file is None:
# remove stats from the categorical stats
# and add the categorical stats to the stats
# remark: this operation seems strange but it ensures that stats are
# in the correct order
categorical_stats -= set(stats)
stats.extend(categorical_stats)
else:
# per_category mode do not compute overall stats.
# So stats is not exended but replaced
stats = categorical_stats
for stat in stats:
cond = self.valid_threshold < 1e-5 or stat == "valid"
metadataname = utils.get_metadata_name(band, prefix[i], stat)
geometries[metadataname] = [
s[i][stat]
if stat in s[i] and (cond or s[i]["valid"] > self.valid_threshold) else np.nan
for s in statistics_data
]
return geometries