Skip to content

terraflow.validation

Spatial block cross-validation (Roberts et al. 2017), Cohen's kappa against an optional reference CSV, and Moran's I on score residuals. Invoked from the CLI as terraflow validate -c config.yml; results are appended to report.json under the validation key.

API Reference

validation

Model validation module — spatial block CV, Cohen's kappa, Moran's I.

run_validation(config_path)

Run model validation and append a validation block to report.json.

Loads the TerraFlow config, locates the most recent pipeline run directory (the one with the latest features.parquet), computes spatial block CV, Moran's I on score residuals, and optionally Cohen's kappa against a reference CSV. Results are written atomically to the existing report.json under the "validation" key.

Parameters:

Name Type Description Default
config_path Path

Path to a TerraFlow YAML config file that includes a validation: section.

required

Returns:

Name Type Description
Path Path

Absolute path to the updated report.json.

Raises:

Type Description
ValueError:

If the config has no validation: section.

FileNotFoundError:

If no pipeline run directory containing features.parquet is found.

Source code in terraflow/validation.py
def run_validation(config_path: Path) -> Path:
    """Run model validation and append a validation block to report.json.

    Loads the TerraFlow config, locates the most recent pipeline run directory
    (the one with the latest ``features.parquet``), computes spatial block CV,
    Moran's I on score residuals, and optionally Cohen's kappa against a
    reference CSV. Results are written atomically to the existing ``report.json``
    under the ``"validation"`` key.

    Parameters
    ----------
    config_path:
        Path to a TerraFlow YAML config file that includes a ``validation:``
        section.

    Returns
    -------
    Path:
        Absolute path to the updated ``report.json``.

    Raises
    ------
    ValueError:
        If the config has no ``validation:`` section.
    FileNotFoundError:
        If no pipeline run directory containing ``features.parquet`` is found.
    """
    data = load_config_dict(config_path)
    cfg = build_config(data)

    if cfg.validation is None:
        raise ValueError(
            "Config file has no 'validation:' section. "
            "Add a validation: block with optional n_blocks_side, buffer_deg, "
            "and reference_csv fields. See TerraFlow documentation for details."
        )

    val_cfg = cfg.validation
    config_dir = Path(config_path).resolve().parent

    run_dir = resolve_run_dir(config_path)
    features_path = run_dir / "features.parquet"
    if not features_path.exists():
        raise FileNotFoundError(
            f"No pipeline run found at {run_dir}. "
            "Run `terraflow run -c config.yml` before running validation."
        )

    logger.info(f"Running validation on {run_dir}")

    # Load features
    df = pd.read_parquet(features_path)
    lats = df["lat"].values
    lons = df["lon"].values
    labels = df["label"].values
    scores = df["score"].values

    # Spatial block CV (Roberts et al. 2017). An empty fold list means no
    # usable folds, in which case the mean accuracy is reported as None.
    fold_accs = _spatial_block_cv(
        lats,
        lons,
        labels,
        n_blocks_side=val_cfg.n_blocks_side,
        buffer_deg=val_cfg.buffer_deg,
    )
    mean_fold_accuracy: Optional[float] = (
        float(np.mean(fold_accs)) if fold_accs else None
    )

    # Moran's I on score residuals (scores centered on their global mean).
    morans_i_val = _morans_i(lats, lons, scores - float(scores.mean()))

    # Cohen's kappa (optional) — only computed when a reference CSV is
    # configured; the path is resolved relative to the config file's directory.
    kappa: Optional[float] = None
    n_ref: Optional[int] = None
    if val_cfg.reference_csv is not None:
        ref_path = (config_dir / val_cfg.reference_csv).resolve()
        reference_df = pd.read_csv(ref_path)
        n_ref = len(reference_df)
        kappa = _compute_kappa(df, reference_df)
        # _compute_kappa may return None (the report block below guards
        # n_reference_points on this). Formatting None with ':.4f' would
        # raise TypeError, so only use the float format when kappa exists.
        if kappa is not None:
            logger.info(
                f"Cohen's kappa computed from {n_ref} reference points: {kappa:.4f}"
            )
        else:
            logger.info(
                f"Cohen's kappa could not be computed from {n_ref} reference points"
            )

    # Read existing report.json and append validation block
    report_path = run_dir / "report.json"
    if report_path.exists():
        with report_path.open("r", encoding="utf-8") as fh:
            report: Dict[str, Any] = json.load(fh)
    else:
        report = {}

    report["validation"] = {
        "method": "spatial_block_cv",
        "citation": "Roberts et al. 2017, Ecography",
        "n_blocks_side": val_cfg.n_blocks_side,
        "buffer_deg": val_cfg.buffer_deg,
        "n_folds": len(fold_accs),
        "mean_fold_accuracy": mean_fold_accuracy,
        "cohen_kappa": kappa,
        "morans_i_residuals": morans_i_val,
        "kriging_loocv_rmse": report.get("kriging_loocv"),
        "reference_dataset": (
            str(val_cfg.reference_csv) if val_cfg.reference_csv else None
        ),
        "n_reference_points": n_ref if kappa is not None else None,
        "note": (
            "model has no free parameters; fold accuracy reflects spatial "
            "label consistency, not fit generalization"
        ),
    }

    _atomic_write_json(report_path, report)
    logger.info(f"Validation block written to {report_path}")

    return report_path