lactationcurve.preprocessing
# Re-export the public preprocessing helpers from the sibling module
# ``validate_and_standardize`` so they can be imported from this package directly.
from .validate_and_standardize import (
    LACTATION_COLUMN_ALIASES,
    PreparedInputs,
    normalize_lactation_column_name,
    resolve_lactation_column_mapping,
    standardize_lactation_columns,
    validate_and_prepare_inputs,
)

# Names exported by ``from <package> import *``; mirrors the import list above.
__all__ = [
    "LACTATION_COLUMN_ALIASES",
    "PreparedInputs",
    "normalize_lactation_column_name",
    "resolve_lactation_column_mapping",
    "standardize_lactation_columns",
    "validate_and_prepare_inputs",
]
@dataclass
class PreparedInputs:
    """Normalized, ready-to-fit inputs.

    This container is returned by `validate_and_prepare_inputs`. The class itself
    performs no validation; the guarantees described below hold for instances
    produced by `validate_and_prepare_inputs`, not for instances built by hand.

    Only `dim` and `milkrecordings` are required; every other field defaults
    to `None`.

    Attributes:
        dim: NumPy array of day-in-milk values (finite; same length as
            `milkrecordings`).
        milkrecordings: NumPy array of test-day milk yields aligned to `dim`.
        model: Lowercased model identifier or `None` if not provided.
        fitting: `"frequentist"` or `"bayesian"` (lowercased) or `None`.
        breed: `"H"` or `"J"` or `None`.
        parity: Lactation number as `int`, if provided; otherwise `None`.
        continent: Prior source for MilkBot API (`"USA"`, `"EU"`), or `None`.
        persistency_method: Either `"derived"` or `"literature"`, or `None`.
        lactation_length: Integer horizon (e.g., 305), the string `"max"`, or `None`.
        milk_unit: Either `"kg"` or `"lb"`. The dataclass default is `None`;
            `validate_and_prepare_inputs` is what supplies the `"kg"` default.
        custom_priors: Either a dict of priors, the string `"CHEN"` to use Chen et al.
            priors, or `None` if not provided.
    """

    # Required, aligned observation arrays.
    dim: np.ndarray
    milkrecordings: np.ndarray
    # Optional settings; None means "not provided".
    model: str | None = None
    fitting: str | None = None
    breed: str | None = None
    parity: int | None = None
    continent: str | None = None
    persistency_method: str | None = None
    lactation_length: int | str | None = None
    milk_unit: str | None = None
    custom_priors: MilkBotPriors | str | None = None
Normalized, ready‑to‑fit inputs.
This container is returned by validate_and_prepare_inputs and is the single
hand‑off object expected by the fitting routines. Arrays are finite and 1‑dimensional;
categorical fields are lower/upper‑cased as appropriate and may be None if omitted.
Attributes:
- dim: 1D NumPy array of day‑in‑milk values (finite; same length as milkrecordings).
- milkrecordings: 1D NumPy array of test‑day milk yields aligned to dim.
- model: Lowercased model identifier, or None if not provided.
- fitting: "frequentist" or "bayesian" (lowercased), or None.
- breed: "H" or "J", or None.
- parity: Lactation number as int, if provided; otherwise None.
- continent: Prior source for MilkBot API ("USA", "EU"), or None.
- persistency_method: Either "derived" or "literature", or None.
- lactation_length: Integer horizon (e.g., 305), the string "max", or None.
- milk_unit: Either "kg" or "lb". The field itself defaults to None; validate_and_prepare_inputs supplies the "kg" default.
- custom_priors: Either a dict of priors, the string "CHEN" to use Chen et al. priors, or None if not provided.
def normalize_lactation_column_name(column: str) -> str:
    """Reduce a column label to the key used for alias matching.

    The label is passed through ``str()``, stripped of surrounding whitespace
    and lowercased; every character outside ``a-z`` / ``0-9`` is then removed.
    ``" Days In-Milk "`` and ``"days_in_milk"`` therefore both normalize to
    ``"daysinmilk"``. A label with no ASCII letters or digits normalizes to
    the empty string.
    """
    lowered = str(column).strip().lower()
    # Drop separators, punctuation and any non-ASCII characters in one pass.
    return re.sub(r"[^a-z0-9]+", "", lowered)
Normalize a lactation-data column label for alias matching.
def resolve_lactation_column_mapping(
    columns: Sequence[str],
    *,
    days_in_milk_col: str | None = None,
    milking_yield_col: str | None = None,
    test_id_col: str | None = None,
    require_test_id: bool = False,
) -> dict[str, str]:
    """Map canonical lactation column names onto the headers actually present.

    Parameters
    ----------
    columns : Sequence[str]
        Headers of the uploaded data, in their original spelling.
    days_in_milk_col, milking_yield_col, test_id_col : str or None, optional
        Explicit header to use for the corresponding canonical column. It is
        matched after normalization, so case and punctuation differences are
        tolerated. When omitted (or empty), the header is looked up through
        ``LACTATION_COLUMN_ALIASES``.
    require_test_id : bool, optional
        When true, a missing ``TestId`` column is reported as an error.

    Returns
    -------
    dict[str, str]
        ``canonical name -> original uploaded column``. ``DaysInMilk`` and
        ``MilkingYield`` are always present; ``TestId`` only when it was found.

    Raises
    ------
    ValueError
        If an explicitly requested header does not exist, or if a required
        canonical column could not be resolved.
    """
    # Index headers by normalized form; the first header wins on collisions and
    # headers that normalize to the empty string are ignored.
    header_lookup: dict[str, str] = {}
    for header in columns:
        key = normalize_lactation_column_name(header)
        if key:
            header_lookup.setdefault(key, header)

    # Order matters: it fixes both the key order of the result and which
    # problem is reported first.
    overrides = (
        ("DaysInMilk", days_in_milk_col),
        ("MilkingYield", milking_yield_col),
        ("TestId", test_id_col),
    )

    resolved: dict[str, str] = {}
    for canonical, requested in overrides:
        if not requested:
            found = _first_matching_column(header_lookup, LACTATION_COLUMN_ALIASES[canonical])
        else:
            found = header_lookup.get(normalize_lactation_column_name(requested))
            if found is None:
                raise ValueError(f"Column '{requested}' was not found.")

        if found is None:
            continue
        resolved[canonical] = found

    missing_required = []
    for canonical in ("DaysInMilk", "MilkingYield"):
        if canonical not in resolved:
            missing_required.append(canonical)
    if require_test_id and "TestId" not in resolved:
        missing_required.append("TestId")

    if not missing_required:
        return resolved

    expected = ", ".join(missing_required)
    raise ValueError(f"No lactation column found for: {expected}.")
Resolve flexible lactation-data headers to canonical column names.
The returned mapping is canonical name -> original uploaded column.
DaysInMilk and MilkingYield are required because the lactation
calculations cannot run without them. TestId is optional by default so
callers can choose whether to create a single-lactation fallback.
def standardize_lactation_columns(
    df: pd.DataFrame,
    *,
    days_in_milk_col: str | None = None,
    milking_yield_col: str | None = None,
    test_id_col: str | None = None,
    default_test_id=0,
    max_dim: int | str = 305,
) -> pd.DataFrame:
    """
    Standardize column names and structure for lactation data.

    Parameters
    ----------
    df : pd.DataFrame
        Input data. It is copied; the caller's frame is not modified.
    days_in_milk_col, milking_yield_col, test_id_col : str or None, optional
        Explicit source column for the corresponding standardized column.
        When omitted, the column is resolved by
        ``resolve_lactation_column_mapping``.
    default_test_id : optional
        Value written to a new ``TestId`` column when no test-id column
        could be resolved. Default is ``0``.
    max_dim : int or str, optional
        Rows with ``DaysInMilk`` greater than this value are dropped. The
        string ``"max"`` (case-insensitive) disables the filter. Any other
        value is converted with ``int()``. Default is ``305``.

    Returns
    -------
    df_out : pd.DataFrame
        Copy of df with standardized columns:
        - DaysInMilk
        - MilkingYield
        - TestId

    Raises
    ------
    ValueError
        Propagated from ``resolve_lactation_column_mapping`` when a required
        column cannot be resolved, or from ``int(max_dim)`` when ``max_dim``
        is a non-numeric string other than ``"max"``.
    """

    df = df.copy()

    # Column resolution works on ``str(label)``, but ``DataFrame.rename`` matches
    # on the real labels and silently skips keys it cannot find. Remember which
    # label each string came from so non-string headers are renamed as well.
    label_by_text: dict = {}
    for label in df.columns:
        label_by_text.setdefault(str(label), label)

    mapping = resolve_lactation_column_mapping(
        [str(col) for col in df.columns],
        days_in_milk_col=days_in_milk_col,
        milking_yield_col=milking_yield_col,
        test_id_col=test_id_col,
    )
    dim_text = mapping["DaysInMilk"]
    yield_text = mapping["MilkingYield"]
    id_text = mapping.get("TestId")

    dim_col = label_by_text.get(dim_text, dim_text)
    yield_col = label_by_text.get(yield_text, yield_text)

    # Create TestId if missing
    if not id_text:
        df["TestId"] = default_test_id
        id_col = "TestId"
    else:
        id_col = label_by_text.get(id_text, id_text)

    # Rename to standardized names
    df = df.rename(
        columns={
            dim_col: "DaysInMilk",
            yield_col: "MilkingYield",
            id_col: "TestId",
        }
    )

    # Filter DIM
    if isinstance(max_dim, str) and max_dim.lower() == "max":
        df = pd.DataFrame(df)
    else:
        df = pd.DataFrame(df[df["DaysInMilk"] <= int(max_dim)])

    return df
Standardize column names and structure for lactation data.
Returns
df_out : pd.DataFrame
Copy of df with standardized columns:
- DaysInMilk
- MilkingYield
- TestId
def validate_and_prepare_inputs(
    dim,
    milkrecordings,
    model=None,
    fitting=None,
    *,
    breed=None,
    parity=None,
    continent=None,
    persistency_method=None,
    lactation_length=None,
    milk_unit="kg",
    custom_priors=None,
) -> PreparedInputs:
    """
    Validate, normalize, and clean input data for lactation curve fitting.

    This function performs basic consistency checks on the provided
    days-in-milk (DIM) and milk recording data, normalizes optional
    categorical parameters, and removes observations with missing or
    non-finite values. The cleaned and validated inputs are returned
    in a structured :class:`PreparedInputs` object.

    Parameters
    ----------
    dim : array-like
        Days in milk corresponding to each milk recording. Must support
        ``len()``; the length check runs before conversion to an array.
    milkrecordings : array-like
        Milk yield measurements corresponding to `dim`.
    model : str or None, optional
        Name of the lactation curve model. If provided, the name is
        stripped of whitespace and converted to lowercase. An empty
        string is treated as ``None``.
    fitting : str or None, optional
        Fitting approach to be used. Must be either ``"frequentist"``
        or ``"bayesian"`` if provided. Case-insensitive.
    breed : str or None, optional
        Cow breed identifier. Must be ``"H"`` (Holstein) or ``"J"``
        (Jersey) if provided. Case-insensitive.
    parity : int or None, optional
        Lactation number (parity). If provided, it is coerced to an
        integer.
    continent : str or None, optional
        Geographic region identifier. Must be one of ``"USA"`` or
        ``"EU"`` if provided. Case-insensitive.
    persistency_method : str or None, optional
        Way of calculating persistency. Must be ``"derived"`` (average
        slope of the lactation after the peak until the end of lactation)
        or ``"literature"`` (for the Wood and MilkBot models) if provided.
        Case-insensitive. ``None`` is passed through unchanged.
    lactation_length : int or str or None, optional
        Length of the lactation in days to calculate persistency over.
        Either a value convertible with ``int()`` (e.g. ``305``) or the
        string ``"max"`` (case-insensitive, returned lowercased) to use
        the maximum DIM in the data. ``None`` is passed through unchanged.
    milk_unit : str, optional
        Unit of milk yield measurements. Must be either ``"kg"`` or ``"lb"``.
        Case-insensitive; returned lowercased. Default is ``"kg"``.
    custom_priors : dict or str or None, optional
        Custom prior distributions for Bayesian fitting. If a dict is provided,
        it must be a dictionary of prior distributions for each parameter in the model.
        If the string ``"CHEN"`` is provided (case-insensitive), the default
        Chen et al. priors are used.

    Returns
    -------
    PreparedInputs
        A dataclass containing the cleaned numeric arrays (`dim`,
        `milkrecordings`) and the normalized optional parameters.

    Raises
    ------
    ValueError
        If input arrays have different lengths, contain insufficient
        valid observations, or if categorical parameters are invalid.

    Notes
    -----
    Observations with missing or non-finite values in either `dim` or
    `milkrecordings` are removed prior to model fitting. At least two
    valid observations are required to proceed.
    """
    if len(dim) != len(milkrecordings):
        raise ValueError("dim and milkrecordings must have the same length")

    model = model.strip().lower() if model else None

    if parity is not None:
        parity = int(parity)

    if fitting is not None:
        fitting = fitting.lower()
        if fitting not in {"frequentist", "bayesian"}:
            raise ValueError("Fitting method must be either frequentist or bayesian")

    if breed is not None:
        breed = breed.upper()
        if breed not in {"H", "J"}:
            raise ValueError("Breed must be either Holstein = 'H' or Jersey 'J'")

    if continent is not None:
        continent = continent.upper()
        if continent not in {"USA", "EU"}:
            raise ValueError("continent must be 'USA' or 'EU'")

    dim = np.asarray(dim, dtype=float)
    milkrecordings = np.asarray(milkrecordings, dtype=float)

    # Keep only pairs where both the day and the yield are finite numbers.
    mask = np.isfinite(dim) & np.isfinite(milkrecordings)
    dim = dim[mask]
    milkrecordings = milkrecordings[mask]

    if len(dim) < 2:
        raise ValueError("At least two non missing points are required to fit a lactation curve")

    if persistency_method is not None:
        persistency_method = persistency_method.lower()
        if persistency_method not in {"derived", "literature"}:
            raise ValueError("persistency_method must be either 'derived' or 'literature'")

    if lactation_length is not None:
        if isinstance(lactation_length, str):
            # Store the normalized spelling so consumers can compare against
            # "max" directly, like every other categorical field.
            lactation_length = lactation_length.lower()
            if lactation_length != "max":
                raise ValueError("lactation_length string option must be 'max'")
        else:
            lactation_length = int(lactation_length)

    # Accept any casing, consistent with the other categorical parameters.
    if isinstance(milk_unit, str):
        milk_unit = milk_unit.lower()
    if milk_unit not in ("kg", "lb"):
        raise ValueError("milk_unit must be 'kg' or 'lb'")

    if custom_priors is not None and not isinstance(custom_priors, (dict, str)):
        raise ValueError("custom_priors must be a dict, a string, or None")

    if isinstance(custom_priors, str):
        custom_priors = custom_priors.upper()
        if custom_priors != "CHEN":
            raise ValueError(
                "custom_priors string option must be"
                " 'CHEN', self defined priors can be"
                " provided as a dictionary through"
                " the build_prior function"
            )

    if isinstance(custom_priors, dict):
        # Static-typing hint only; the dict is passed through unchanged.
        custom_priors = cast(MilkBotPriors, custom_priors)

    return PreparedInputs(
        dim=dim,
        milkrecordings=milkrecordings,
        model=model or None,
        fitting=fitting,
        breed=breed,
        parity=parity,
        continent=continent,
        persistency_method=persistency_method,
        lactation_length=lactation_length,
        milk_unit=milk_unit,
        custom_priors=custom_priors,
    )
Validate, normalize, and clean input data for lactation curve fitting.
This function performs basic consistency checks on the provided
days-in-milk (DIM) and milk recording data, normalizes optional
categorical parameters, and removes observations with missing or
non-finite values. The cleaned and validated inputs are returned
in a structured PreparedInputs object.
Parameters
dim : array-like
Days in milk corresponding to each milk recording.
milkrecordings : array-like
Milk yield measurements corresponding to dim.
model : str or None, optional
Name of the lactation curve model. If provided, the name is
stripped of whitespace and converted to lowercase.
fitting : str or None, optional
Fitting approach to be used. Must be either "frequentist"
or "bayesian" if provided.
breed : str or None, optional
Cow breed identifier. Must be "H" (Holstein) or "J"
(Jersey) if provided. Case-insensitive.
parity : int or None, optional
Lactation number (parity). If provided, it is coerced to an
integer.
continent : str or None, optional
Geographic region identifier. Must be one of "USA" or
"EU" if provided. Case-insensitive.
milk_unit : str, optional
Unit of milk yield measurements. Must be either "kg" or "lb". Default is "kg".
custom_priors : dict or str or None, optional
Custom prior distributions for Bayesian fitting. If a dict is provided,
it must be a dictionary of prior distributions for each parameter in the model.
If the string "CHEN" is provided, the default Chen et al. priors are used.
Extra input for persistency calculation:
persistency_method : str, optional
Way of calculating persistency. Options: 'derived', which gives the average slope of the lactation after the peak until the end of lactation (default), or 'literature' for the Wood and MilkBot models.
lactation_length : str or int, optional
Length of the lactation in days to calculate persistency over. Options: 305 (default), 'max' to use the maximum DIM in the data, or an integer value to set the desired lactation length.
Returns
PreparedInputs
A dataclass containing the cleaned numeric arrays (dim,
milkrecordings) and the normalized optional parameters.
Raises
ValueError If input arrays have different lengths, contain insufficient valid observations, or if categorical parameters are invalid.
Notes
Observations with missing or non-finite values in either dim or
milkrecordings are removed prior to model fitting. At least two
valid observations are required to proceed.