# Source code for src.toolbox.steps.custom.gen_data

# This file is part of the NOC Autonomy Toolbox.
#
# Copyright 2025-2026 National Oceanography Centre and The Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Step for generating synthetic data for testing pipelines"""

import polars as pl
import xarray as xr
import numpy as np
from ..base_step import BaseStep, register_step
from datetime import date, timedelta


@register_step
class GenerateData(BaseStep):
    """Step for generating synthetic data for testing pipelines.

    Depending on configuration, either a small deterministic dataframe
    (``gen_fixed_data`` truthy) or a time-indexed dataframe of uniformly
    random values is built, converted to an ``xarray`` dataset, and stored
    in ``self.context["data"]``.

    Expected config parameters (random mode) — presumably supplied via the
    step config; verify against the pipeline loader:
        sampling_info: [start_date "YYYY-MM-DD", end_date "YYYY-MM-DD",
            sample_period_seconds]
        additional_variables: iterable of extra variable names to generate
        value_limits: mapping of variable name -> [lower, upper]
        diagnostics: bool — log the resolved variable limits when True
    """

    step_name = "Generate Data"
    required_variables = []
    provided_variables = ["TIME", "LATITUDE", "LONGITUDE", "PRES", "TEMP", "CNDC"]

    def run(self):
        """Generate synthetic data and place it in the pipeline context.

        Returns:
            The updated pipeline context, with ``context["data"]`` set to an
            ``xarray.Dataset`` whose row dimension is ``N_MEASUREMENTS``.

        Raises:
            ValueError: if data is already present in the context, or if a
                configured upper limit is not greater than its lower limit.
        """
        # Refuse to clobber data produced by an earlier step. NOTE: the
        # original message claimed the data "will be replaced", but the raise
        # aborts the step instead — message fixed to describe what happens.
        if "data" in self.context:
            raise ValueError(
                "[Generate Data] Data already present in context; refusing to "
                "overwrite it with generated data."
            )

        if self.gen_fixed_data:
            self.log("Generating fixed data")
            df = self._fixed_frame()
        else:
            self.log("Generating random data")
            df = self._random_frame()

        # Make the xarray data from the polars dataframe and ship it
        # TODO: Add metadata flexibility
        data = df.to_pandas().to_xarray()
        data["N_PARAM"] = list(data.keys())
        data = data.rename({"index": "N_MEASUREMENTS"})
        self.context["data"] = data
        return self.context

    def _fixed_frame(self):
        """Build a small deterministic frame for reproducible tests.

        Produces ``ncols`` constant-valued data columns and matching ``_QC``
        columns that enumerate every combination of QC flags 0-9 (one row per
        combination, i.e. ``10**ncols`` rows).
        """
        import itertools  # only needed on this branch

        ncols = 2
        column_names = ["A", "B", "C"][:ncols]
        # Every (flag_A, flag_B, ...) combination, one row each.
        qc_values = np.array(list(itertools.product(range(10), repeat=ncols)))
        # Column "A" is all 1s, "B" all 2s, ... matching the row count above.
        values = [[i] * int(10**ncols) for i in range(1, ncols + 1)]
        return pl.DataFrame(
            {
                **{col: values[i] for i, col in enumerate(column_names)},
                **{
                    f"{col}_QC": qc_values[:, i]
                    for i, col in enumerate(column_names)
                },
            }
        )

    def _random_frame(self):
        """Build a time-indexed frame of uniformly random values per variable.

        Raises:
            ValueError: if a configured upper limit is <= its lower limit.
        """
        # Load config parameters
        start_date, end_date, sample_period = self.parameters["sampling_info"]
        additional_variables = self.parameters["additional_variables"]
        user_value_limits = self.parameters["value_limits"]
        diagnostics = self.parameters["diagnostics"]

        # Core variables plus any user-requested additions.
        variable_names = {"LATITUDE", "LONGITUDE", "PRES", "TEMP", "CNDC"}
        variable_names.update(additional_variables)

        # Default variable limits, overridable via the user's config.
        variable_limits = {
            "LATITUDE": [-90, 90],  # Degrees
            "LONGITUDE": [-180, 180],  # Degrees
            "PRES": [0, 100],  # Bar
            "TEMP": [0, 20],  # Celsius
            "CNDC": [34, 35],  # S/m
        }
        variable_limits.update(user_value_limits)
        if diagnostics:
            self.log(f"[Generate Data] Variables: {variable_limits}")

        # Time index for the dataframe at the configured sampling period.
        df = pl.select(
            pl.datetime_range(
                date(*map(int, start_date.split("-"))),
                date(*map(int, end_date.split("-"))),
                timedelta(seconds=sample_period),
                time_unit="ns",
            ).alias("TIME")
        )
        data_length = len(df)

        # Generate random data for the remaining variables. Sorted so the
        # resulting column order is deterministic (set order is not).
        for variable_name in sorted(variable_names):
            if variable_name in variable_limits:
                lower, upper = variable_limits[variable_name]
                if upper <= lower:
                    raise ValueError(
                        f"Upper limit must be greater than lower limit for {variable_name}"
                    )
            else:
                self.log(
                    f"The additional variable {variable_name} has not "
                    "been set limits. Defaulting to [0, 1]."
                )
                lower, upper = 0, 1
            # Add the new column of uniform random values within the limits.
            df = df.with_columns(
                pl.lit(np.random.uniform(lower, upper, data_length)).alias(
                    variable_name
                )
            )
        return df