lactationcurve.preprocessing

View Source

# Public surface of ``lactationcurve.preprocessing``: every name below is
# implemented in the sibling module ``validate_and_standardize`` and is
# re-exported here so callers can import it from the package directly.
from .validate_and_standardize import (
    LACTATION_COLUMN_ALIASES,
    PreparedInputs,
    normalize_lactation_column_name,
    resolve_lactation_column_mapping,
    standardize_lactation_columns,
    validate_and_prepare_inputs,
)

# Explicit export list; it mirrors the import list above one-to-one.
__all__ = [
    "LACTATION_COLUMN_ALIASES",
    "PreparedInputs",
    "normalize_lactation_column_name",
    "resolve_lactation_column_mapping",
    "standardize_lactation_columns",
    "validate_and_prepare_inputs",
]

# Accepted header spellings for each canonical lactation column.
# Keys are the canonical column names; each value lists the aliases in the
# order they are declared (canonical spelling first).
LACTATION_COLUMN_ALIASES = {
    "DaysInMilk": (
        "DaysInMilk",
        "Days in Milk",
        "Days In Milk",
        "Days_In_Milk",
        "DIM",
        "Dim",
        "Days",
        "Day",
        "TestDay",
        "Test Day",
        "LactationDay",
        "Lactation Day",
    ),
    "MilkingYield": (
        "MilkingYield",
        "Milking Yield",
        "Milking_Yield",
        "DailyMilkingYield",
        "Daily Milking Yield",
        "Daily Milking Yield (kg)",
        "DailyMilkYield",
        "Daily Milk Yield",
        "TestDayMilkYield",
        "Test Day Milk Yield",
        "MilkYield",
        "Milk Yield",
        "Milk Yield (kg)",
        "Milk_Yield",
        "Milk kg",
        "milk_kg",
        "MILK",
        "Milk",
        "Yield",
        "MilkProduction",
        "Milk Production",
        "MilkRecording",
        "Milk Recording",
    ),
    "TestId": (
        "TestId",
        "Test ID",
        "Test_ID",
        "CowId",
        "Cow ID",
        "Cow_ID",
        "Cow",
        "AnimalId",
        "Animal ID",
        "Animal_ID",
        "AnimalNumber",
        "Animal Number",
        "Animal",
        "EarNumber",
        "Ear Number",
        "Oornummer",
        "Koe",
        "Diernummer",
        "LactationId",
        "Lactation ID",
        "Lactation",
        "ID",
        "Id",
    ),
}

@dataclass

class PreparedInputs: View Source

@dataclass
class PreparedInputs:
    """Normalized, ready-to-fit inputs.

    This container is returned by `validate_and_prepare_inputs` and is the single
    hand-off object expected by the fitting routines. The class itself declares
    fields only; the guarantees described below (finite arrays, normalized
    casing) are established by `validate_and_prepare_inputs` before it builds
    the instance, not enforced here.

    Attributes:
        dim: 1D NumPy array of day-in-milk values (finite; same length as `milkrecordings`).
        milkrecordings: 1D NumPy array of test-day milk yields aligned to `dim`.
        model: Lowercased model identifier or `None` if not provided.
        fitting: `"frequentist"` or `"bayesian"` (lowercased) or `None`.
        breed: `"H"` or `"J"` or `None`.
        parity: Lactation number as `int`, if provided; otherwise `None`.
        continent: Prior source for MilkBot API (`"USA"`, `"EU"`), or `None`.
        persistency_method: Either `"derived"` or `"literature"`, or `None`.
        lactation_length: Integer horizon (e.g., 305), the string `"max"`, or `None`.
        milk_unit: Either `"kg"` or `"lb"`. The field default is `None`; the
            `"kg"` default is applied by `validate_and_prepare_inputs`, whose
            `milk_unit` parameter defaults to `"kg"`.
        custom_priors: Either a dict of priors, the string `"CHEN"` to use Chen et al. priors,
            or `None` if not provided.
    """

    # Required, aligned observation arrays.
    dim: np.ndarray
    milkrecordings: np.ndarray
    # Optional settings; every one of them defaults to None when omitted.
    model: str | None = None
    fitting: str | None = None
    breed: str | None = None
    parity: int | None = None
    continent: str | None = None
    persistency_method: str | None = None
    lactation_length: int | str | None = None
    milk_unit: str | None = None
    custom_priors: MilkBotPriors | str | None = None

Normalized, ready‑to‑fit inputs.

This container is returned by validate_and_prepare_inputs and is the single hand‑off object expected by the fitting routines. Arrays are finite and 1‑dimensional; categorical fields are lower/upper‑cased as appropriate and may be None if omitted.

Attributes:

dim: 1D NumPy array of day‑in‑milk values (finite; same length as milkrecordings).
milkrecordings: 1D NumPy array of test‑day milk yields aligned to dim.
model: Lowercased model identifier or None if not provided.
fitting: "frequentist" or "bayesian" (lowercased) or None.
breed: "H" or "J" or None.
parity: Lactation number as int, if provided; otherwise None.
continent: Prior source for MilkBot API ("USA", "EU"), or None.
persistency_method: Either "derived" or "literature", or None.
lactation_length: Integer horizon (e.g., 305), the string "max", or None.
milk_unit: Either "kg" or "lb". The field itself defaults to None; validate_and_prepare_inputs supplies "kg" when no unit is passed.
custom_priors: Either a dict of priors, the string "CHEN" to use Chen et al. priors, or None if not provided.

dim: numpy.ndarray

milkrecordings: numpy.ndarray

model: str | None = None

fitting: str | None = None

breed: str | None = None

parity: int | None = None

continent: str | None = None

persistency_method: str | None = None

lactation_length: int | str | None = None

milk_unit: str | None = None

custom_priors: lactationcurve.preprocessing.validate_and_standardize.MilkBotPriors | str | None = None

def normalize_lactation_column_name(column: str) -> str: View Source

def normalize_lactation_column_name(column: str) -> str:
    """Reduce a column label to the key used for alias matching.

    The label is converted with ``str()``, stripped of surrounding whitespace
    and lowercased; every character outside ``a-z`` and ``0-9`` is then
    removed, so ``"Milk Yield (kg)"`` becomes ``"milkyieldkg"``.
    """
    lowered = str(column).strip().lower()
    # Drop spaces, underscores, punctuation and any other non-alphanumerics.
    return re.sub(r"[^a-z0-9]+", "", lowered)

Normalize a lactation-data column label for alias matching.

def resolve_lactation_column_mapping( columns: Sequence[str], *, days_in_milk_col: str | None = None, milking_yield_col: str | None = None, test_id_col: str | None = None, require_test_id: bool = False) -> dict[str, str]: View Source

def resolve_lactation_column_mapping(
    columns: Sequence[str],
    *,
    days_in_milk_col: str | None = None,
    milking_yield_col: str | None = None,
    test_id_col: str | None = None,
    require_test_id: bool = False,
) -> dict[str, str]:
    """Map canonical lactation column names onto the headers actually present.

    Headers are compared after `normalize_lactation_column_name`, so casing,
    spacing and punctuation differences are ignored. When several headers
    normalize to the same key, the first one in ``columns`` is used.

    Parameters
    ----------
    columns : Sequence[str]
        Column labels of the uploaded data.
    days_in_milk_col, milking_yield_col, test_id_col : str or None, optional
        Explicit header to use for the corresponding canonical column. When
        omitted (or empty), the header is looked up through
        ``LACTATION_COLUMN_ALIASES`` via ``_first_matching_column``.
    require_test_id : bool, optional
        If true, a missing ``TestId`` column is reported as an error as well.

    Returns
    -------
    dict[str, str]
        ``canonical name -> original uploaded column``. ``DaysInMilk`` and
        ``MilkingYield`` are always present; ``TestId`` only when it was found.

    Raises
    ------
    ValueError
        If an explicitly requested header is not among ``columns``, or if a
        required canonical column could not be resolved.
    """
    # Normalized key -> original label; the first occurrence of a key wins.
    lookup: dict[str, str] = {}
    for label in columns:
        key = normalize_lactation_column_name(label)
        if key:
            lookup.setdefault(key, label)

    overrides = (
        ("DaysInMilk", days_in_milk_col),
        ("MilkingYield", milking_yield_col),
        ("TestId", test_id_col),
    )

    resolved: dict[str, str] = {}
    for canonical, requested in overrides:
        if not requested:
            # No explicit header given: fall back to the alias table. The
            # helper may return None, in which case the name stays unresolved.
            found = _first_matching_column(lookup, LACTATION_COLUMN_ALIASES[canonical])
            if found is not None:
                resolved[canonical] = found
            continue

        found = lookup.get(normalize_lactation_column_name(requested))
        if found is None:
            raise ValueError(f"Column '{requested}' was not found.")
        resolved[canonical] = found

    required = ["DaysInMilk", "MilkingYield"]
    if require_test_id:
        required.append("TestId")

    absent = [canonical for canonical in required if canonical not in resolved]
    if absent:
        expected = ", ".join(absent)
        raise ValueError(f"No lactation column found for: {expected}.")

    return resolved

Resolve flexible lactation-data headers to canonical column names.

The returned mapping is canonical name -> original uploaded column. DaysInMilk and MilkingYield are required because the lactation calculations cannot run without them. TestId is optional by default so callers can choose whether to create a single-lactation fallback.

def standardize_lactation_columns( df: pandas.core.frame.DataFrame, *, days_in_milk_col: str | None = None, milking_yield_col: str | None = None, test_id_col: str | None = None, default_test_id=0, max_dim: int | str = 305) -> pandas.core.frame.DataFrame: View Source

def standardize_lactation_columns(
    df: pd.DataFrame,
    *,
    days_in_milk_col: str | None = None,
    milking_yield_col: str | None = None,
    test_id_col: str | None = None,
    default_test_id=0,
    max_dim: int | str = 305,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with canonical lactation column names.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is copied, never modified in place.
    days_in_milk_col, milking_yield_col, test_id_col : str or None, optional
        Explicit source headers, forwarded to
        `resolve_lactation_column_mapping`. Unspecified headers are resolved
        by that function.
    default_test_id : optional
        Value written to a new ``TestId`` column when no id column was
        resolved.
    max_dim : int or str, optional
        Only rows satisfying ``DaysInMilk <= int(max_dim)`` are kept. The
        string ``"max"`` (any casing) disables the filter.

    Returns
    -------
    df_out : pd.DataFrame
        Copy of df with standardized columns:
        - DaysInMilk
        - MilkingYield
        - TestId
    """
    result = df.copy()
    header_map = resolve_lactation_column_mapping(
        [str(label) for label in result.columns],
        days_in_milk_col=days_in_milk_col,
        milking_yield_col=milking_yield_col,
        test_id_col=test_id_col,
    )

    # Without an id column every row is assigned the same fallback id.
    source_id = header_map.get("TestId")
    if not source_id:
        result["TestId"] = default_test_id
        source_id = "TestId"

    renames = {
        header_map["DaysInMilk"]: "DaysInMilk",
        header_map["MilkingYield"]: "MilkingYield",
        source_id: "TestId",
    }
    result = result.rename(columns=renames)

    keep_all_rows = isinstance(max_dim, str) and max_dim.lower() == "max"
    if keep_all_rows:
        return pd.DataFrame(result)

    within_horizon = result["DaysInMilk"] <= int(max_dim)
    return pd.DataFrame(result[within_horizon])

Standardize column names and structure for lactation data.

Returns

df_out : pd.DataFrame
    Copy of df with standardized columns:
    - DaysInMilk
    - MilkingYield
    - TestId

def validate_and_prepare_inputs( dim, milkrecordings, model=None, fitting=None, *, breed=None, parity=None, continent=None, persistency_method=None, lactation_length=None, milk_unit='kg', custom_priors=None) -> PreparedInputs: View Source

def validate_and_prepare_inputs(
    dim,
    milkrecordings,
    model=None,
    fitting=None,
    *,
    breed=None,
    parity=None,
    continent=None,
    persistency_method=None,
    lactation_length=None,
    milk_unit="kg",
    custom_priors=None,
) -> PreparedInputs:
    """
    Validate, normalize, and clean input data for lactation curve fitting.

    This function performs basic consistency checks on the provided
    days-in-milk (DIM) and milk recording data, normalizes optional
    categorical parameters, and removes observations with missing or
    non-finite values. The cleaned and validated inputs are returned
    in a structured :class:`PreparedInputs` object.

    Parameters
    ----------
    dim : array-like
        Days in milk corresponding to each milk recording.
    milkrecordings : array-like
        Milk yield measurements corresponding to `dim`.
    model : str or None, optional
        Name of the lactation curve model. If provided, the name is
        stripped of whitespace and converted to lowercase; an empty
        result is returned as ``None``.
    fitting : str or None, optional
        Fitting approach to be used. Must be either ``"frequentist"``
        or ``"bayesian"`` if provided. Case-insensitive.
    breed : str or None, optional
        Cow breed identifier. Must be ``"H"`` (Holstein) or ``"J"``
        (Jersey) if provided. Case-insensitive.
    parity : int or None, optional
        Lactation number (parity). If provided, it is coerced to an
        integer.
    continent : str or None, optional
        Geographic region identifier. Must be one of ``"USA"`` or
        ``"EU"`` if provided. Case-insensitive.
    persistency_method : str or None, optional
        Way of calculating persistency: ``"derived"`` (average slope of
        the lactation after the peak until the end of lactation) or
        ``"literature"`` (for the Wood and MilkBot models).
        Case-insensitive. ``None`` is passed through unchanged.
    lactation_length : int or str or None, optional
        Length of the lactation in days to calculate persistency over:
        an integer value (e.g. 305), or ``"max"`` to use the maximum DIM
        in the data. The string is case-insensitive and stored as
        ``"max"``; any other non-string value is coerced with ``int()``.
        ``None`` is passed through unchanged.
    milk_unit : str, optional
        Unit of milk yield measurements. Must be either ``"kg"`` or
        ``"lb"`` (case-insensitive). Default is ``"kg"``.
    custom_priors : dict or str or None, optional
        Custom prior distributions for Bayesian fitting. If a dict is provided,
        it must be a dictionary of prior distributions for each parameter in the model.
        If the string ``"CHEN"`` (case-insensitive) is provided, the default
        Chen et al. priors are used.

    Returns
    -------
    PreparedInputs
        A dataclass containing the cleaned numeric arrays (`dim`,
        `milkrecordings`) and the normalized optional parameters.

    Raises
    ------
    ValueError
        If input arrays have different lengths, contain insufficient
        valid observations, or if categorical parameters are invalid.

    Notes
    -----
    Observations with missing or non-finite values in either `dim` or
    `milkrecordings` are removed prior to model fitting. At least two
    valid observations are required to proceed.
    """
    if len(dim) != len(milkrecordings):
        raise ValueError("dim and milkrecordings must have the same length")

    model = model.strip().lower() if model else None

    if parity is not None:
        parity = int(parity)

    if fitting is not None:
        fitting = fitting.lower()
        if fitting not in {"frequentist", "bayesian"}:
            raise ValueError("Fitting method must be either frequentist or bayesian")

    if breed is not None:
        breed = breed.upper()
        if breed not in {"H", "J"}:
            raise ValueError("Breed must be either Holstein = 'H' or Jersey 'J'")

    if continent is not None:
        continent = continent.upper()
        if continent not in {"USA", "EU"}:
            raise ValueError("continent must be 'USA' or 'EU'")

    dim = np.asarray(dim, dtype=float)
    milkrecordings = np.asarray(milkrecordings, dtype=float)

    # Keep only the observation pairs where both values are finite
    # (drops NaN and +/-inf in either array).
    finite = np.isfinite(dim) & np.isfinite(milkrecordings)
    dim = dim[finite]
    milkrecordings = milkrecordings[finite]

    if len(dim) < 2:
        raise ValueError("At least two non missing points are required to fit a lactation curve")

    if persistency_method is not None:
        persistency_method = persistency_method.lower()
        if persistency_method not in {"derived", "literature"}:
            raise ValueError("persistency_method must be either 'derived' or 'literature'")

    if lactation_length is not None:
        if isinstance(lactation_length, str):
            # Store the canonical lowercase spelling, so "MAX"/"Max" end up as
            # the documented "max" just like the other categorical options.
            lactation_length = lactation_length.lower()
            if lactation_length != "max":
                raise ValueError("lactation_length string option must be 'max'")
        else:
            lactation_length = int(lactation_length)

    # Compare the unit case-insensitively, consistent with the other
    # categorical parameters; non-string values still fail the check below.
    if isinstance(milk_unit, str):
        milk_unit = milk_unit.lower()
    if milk_unit not in ("kg", "lb"):
        raise ValueError("milk_unit must be 'kg' or 'lb'")

    if custom_priors is not None and not isinstance(custom_priors, (dict, str)):
        raise ValueError("custom_priors must be a dict, a string, or None")

    if isinstance(custom_priors, str):
        custom_priors = custom_priors.upper()
        if custom_priors != "CHEN":
            raise ValueError(
                "custom_priors string option must be"
                " 'CHEN', self defined priors can be"
                " provided as a dictionary through"
                " the build_prior function"
            )

    if isinstance(custom_priors, dict):
        # Typing-only narrowing; the dict is passed on unchanged at runtime.
        custom_priors = cast(MilkBotPriors, custom_priors)

    return PreparedInputs(
        dim=dim,
        milkrecordings=milkrecordings,
        model=model or None,
        fitting=fitting,
        breed=breed,
        parity=parity,
        continent=continent,
        persistency_method=persistency_method,
        lactation_length=lactation_length,
        milk_unit=milk_unit,
        custom_priors=custom_priors,
    )

Validate, normalize, and clean input data for lactation curve fitting.

This function performs basic consistency checks on the provided days-in-milk (DIM) and milk recording data, normalizes optional categorical parameters, and removes observations with missing or non-finite values. The cleaned and validated inputs are returned in a structured PreparedInputs object.

Parameters

dim : array-like
    Days in milk corresponding to each milk recording.
milkrecordings : array-like
    Milk yield measurements corresponding to dim.
model : str or None, optional
    Name of the lactation curve model. If provided, the name is stripped of whitespace and converted to lowercase.
fitting : str or None, optional
    Fitting approach to be used. Must be either "frequentist" or "bayesian" if provided.
breed : str or None, optional
    Cow breed identifier. Must be "H" (Holstein) or "J" (Jersey) if provided. Case-insensitive.
parity : int or None, optional
    Lactation number (parity). If provided, it is coerced to an integer.
continent : str or None, optional
    Geographic region identifier. Must be one of "USA" or "EU" if provided. Case-insensitive.
milk_unit : str, optional
    Unit of milk yield measurements. Must be either "kg" or "lb". Default is "kg".
custom_priors : dict or str or None, optional
    Custom prior distributions for Bayesian fitting. If a dict is provided, it must be a dictionary of prior distributions for each parameter in the model. If the string "CHEN" is provided, the default Chen et al. priors are used.

Extra input for persistency calculation:

persistency_method : str or None, optional
    Way of calculating persistency. Options: 'derived', which gives the average slope of the lactation after the peak until the end of lactation (described as the default), or 'literature' for the Wood and MilkBot models.
lactation_length : int or str or None, optional
    Length of the lactation in days to calculate persistency over. Options: 305 (described as the default), 'max' to use the maximum DIM in the data, or an integer value to set the desired lactation length.

Note: in the signature both parameters default to None, and this function passes None through unchanged.

Returns

PreparedInputs: A dataclass containing the cleaned numeric arrays (dim, milkrecordings) and the normalized optional parameters.

Raises

ValueError: If input arrays have different lengths, contain insufficient valid observations, or if categorical parameters are invalid.

Notes

Observations with missing or non-finite values in either dim or milkrecordings are removed prior to model fitting. At least two valid observations are required to proceed.