lactationcurve.preprocessing

 1from .validate_and_standardize import (
 2    PreparedInputs,
 3    standardize_lactation_columns,
 4    validate_and_prepare_inputs,
 5)
 6
 7__all__ = [
 8    "PreparedInputs",
 9    "standardize_lactation_columns",
10    "validate_and_prepare_inputs",
11]
@dataclass
class PreparedInputs:
42@dataclass
43class PreparedInputs:
44    """Normalized, ready‑to‑fit inputs.
45
46    This container is returned by `validate_and_prepare_inputs` and is the single
47    hand‑off object expected by the fitting routines. Arrays are finite and 1‑dimensional;
48    categorical fields are lower/upper‑cased as appropriate and may be `None` if omitted.
49
50    Attributes:
51        dim: 1D NumPy array of day‑in‑milk values (finite; same length as `milkrecordings`).
52        milkrecordings: 1D NumPy array of test‑day milk yields aligned to `dim`.
53        model: Lowercased model identifier or `None` if not provided.
54        fitting: `"frequentist"` or `"bayesian"` (lowercased) or `None`.
55        breed: `"H"` or `"J"` or `None`.
56        parity: Lactation number as `int`, if provided; otherwise `None`.
57        continent: Prior source flag for Bayesian flows (`"USA"`, `"EU"`, `"CHEN"`), or `None`.
58        persistency_method: Either `"derived"` or `"literature"`, or `None`.
59        lactation_length: Integer horizon (e.g., 305), the string `"max"`, or `None`.
60    """
61
62    dim: np.ndarray
63    milkrecordings: np.ndarray
64    model: str | None = None
65    fitting: str | None = None
66    breed: str | None = None
67    parity: int | None = None
68    continent: str | None = None
69    persistency_method: str | None = None
70    lactation_length: int | str | None = None

Normalized, ready‑to‑fit inputs.

This container is returned by validate_and_prepare_inputs and is the single hand‑off object expected by the fitting routines. Arrays are finite and 1‑dimensional; categorical fields are lower/upper‑cased as appropriate and may be None if omitted.

Attributes:
  • dim: 1D NumPy array of day‑in‑milk values (finite; same length as milkrecordings).
  • milkrecordings: 1D NumPy array of test‑day milk yields aligned to dim.
  • model: Lowercased model identifier or None if not provided.
  • fitting: "frequentist" or "bayesian" (lowercased) or None.
  • breed: "H" or "J" or None.
  • parity: Lactation number as int, if provided; otherwise None.
  • continent: Prior source flag for Bayesian flows ("USA", "EU", "CHEN"), or None.
  • persistency_method: Either "derived" or "literature", or None.
  • lactation_length: Integer horizon (e.g., 305), the string "max", or None.
PreparedInputs( dim: numpy.ndarray, milkrecordings: numpy.ndarray, model: str | None = None, fitting: str | None = None, breed: str | None = None, parity: int | None = None, continent: str | None = None, persistency_method: str | None = None, lactation_length: int | str | None = None)
dim: numpy.ndarray
milkrecordings: numpy.ndarray
model: str | None = None
fitting: str | None = None
breed: str | None = None
parity: int | None = None
continent: str | None = None
persistency_method: str | None = None
lactation_length: int | str | None = None
def standardize_lactation_columns( df: pandas.DataFrame, *, days_in_milk_col: str | None = None, milking_yield_col: str | None = None, test_id_col: str | None = None, default_test_id=0, max_dim: int = 305):
204def standardize_lactation_columns(
205    df: pd.DataFrame,
206    *,
207    days_in_milk_col: str | None = None,
208    milking_yield_col: str | None = None,
209    test_id_col: str | None = None,
210    default_test_id=0,
211    max_dim: int = 305,
212):
213    """
214    Standardize column names and structure for lactation data.
215
216    Returns
217    -------
218    df_out : pd.DataFrame
219        Copy of df with standardized columns:
220        - DaysInMilk
221        - MilkingYield
222        - TestId
223    """
224
225    df = df.copy()
226
227    # Accepted aliases (case-insensitive)
228    aliases = {
229        "DaysInMilk": ["daysinmilk", "dim", "testday"],
230        "MilkingYield": [
231            "milkingyield",
232            "testdaymilkyield",
233            "milkyield",
234            "yield",
235            "milkproduction",
236            "milk_yield",
237        ],
238        "TestId": ["testid", "animalid", "id"],
239    }
240
241    # Lowercase lookup → actual column name
242    col_lookup = {col.lower(): col for col in df.columns}
243
244    def resolve_col(override, possible_names):
245        if override:
246            return col_lookup.get(override.lower())
247        for name in possible_names:
248            if name in col_lookup:
249                return col_lookup[name]
250        return None
251
252    # Resolve columns
253    dim_col = resolve_col(days_in_milk_col, aliases["DaysInMilk"])
254    if not dim_col:
255        raise ValueError("No DaysInMilk column found.")
256
257    yield_col = resolve_col(milking_yield_col, aliases["MilkingYield"])
258    if not yield_col:
259        raise ValueError("No MilkingYield column found.")
260
261    id_col = resolve_col(test_id_col, aliases["TestId"])
262
263    # Create TestId if missing
264    if not id_col:
265        df["TestId"] = default_test_id
266        id_col = "TestId"
267
268    # Rename to standardized names
269    df = df.rename(
270        columns={
271            dim_col: "DaysInMilk",
272            yield_col: "MilkingYield",
273            id_col: "TestId",
274        }
275    )
276
277    # Filter DIM
278    df = pd.DataFrame(df[df["DaysInMilk"] <= max_dim])
279
280    return df

Standardize column names and structure for lactation data.

Returns

df_out : pd.DataFrame
  Copy of df, restricted to rows with DaysInMilk ≤ max_dim, with standardized columns:
  • DaysInMilk
  • MilkingYield
  • TestId

def validate_and_prepare_inputs( dim, milkrecordings, model=None, fitting=None, *, breed=None, parity=None, continent=None, persistency_method=None, lactation_length=None) -> PreparedInputs:
 73def validate_and_prepare_inputs(
 74    dim,
 75    milkrecordings,
 76    model=None,
 77    fitting=None,
 78    *,
 79    breed=None,
 80    parity=None,
 81    continent=None,
 82    persistency_method=None,
 83    lactation_length=None,
 84) -> PreparedInputs:
 85    """
 86    Validate, normalize, and clean input data for lactation curve fitting.
 87
 88    This function performs basic consistency checks on the provided
 89    days-in-milk (DIM) and milk recording data, normalizes optional
 90    categorical parameters, and removes observations with missing or
 91    non-finite values. The cleaned and validated inputs are returned
 92    in a structured :class:`PreparedInputs` object.
 93
 94    Parameters
 95    ----------
 96    dim : array-like
 97        Days in milk corresponding to each milk recording.
 98    milkrecordings : array-like
 99        Milk yield measurements corresponding to `dim`.
100    model : str or None, optional
101        Name of the lactation curve model. If provided, the name is
102        stripped of whitespace and converted to lowercase.
103    fitting : str or None, optional
104        Fitting approach to be used. Must be either ``"frequentist"``
105        or ``"bayesian"`` if provided.
106    breed : str or None, optional
107        Cow breed identifier. Must be ``"H"`` (Holstein) or ``"J"``
108        (Jersey) if provided. Case-insensitive.
109    parity : int or None, optional
110        Lactation number (parity). If provided, it is coerced to an
111        integer.
112    continent : str or None, optional
113        Geographic region identifier. Must be one of ``"USA"``,
114        ``"EU"``, or ``"CHEN"`` if provided. Case-insensitive.
115
116    Extra input for persistency calculation:
117        persistency_method (String): way of calculating
118            persistency, options: 'derived' which gives the
119            average slope of the lactation after the peak until
120            the end of lactation (default) or 'literature' for
121            the wood and milkbot model.
122        Lactation_length: string or int: length of the lactation
123            in days to calculate persistency over, options:
124            305 = default or 'max' uses the maximum DIM in the
125            data, or an integer value to set the desired
126            lactation length.
127
128    Returns
129    -------
130    PreparedInputs
131        A dataclass containing the cleaned numeric arrays (`dim`,
132        `milkrecordings`) and the normalized optional parameters.
133
134    Raises
135    ------
136    ValueError
137        If input arrays have different lengths, contain insufficient
138        valid observations, or if categorical parameters are invalid.
139
140    Notes
141    -----
142    Observations with missing or non-finite values in either `dim` or
143    `milkrecordings` are removed prior to model fitting. At least two
144    valid observations are required to proceed.
145    """
146    if len(dim) != len(milkrecordings):
147        raise ValueError("dim and milkrecordings must have the same length")
148
149    model = (model or "").strip().lower()
150
151    if parity is not None:
152        parity = int(parity)
153
154    if fitting is not None:
155        fitting = fitting.lower()
156        if fitting not in {"frequentist", "bayesian"}:
157            raise ValueError("Fitting method must be either frequentist or bayesian")
158
159    if breed is not None:
160        breed = breed.upper()
161        if breed not in {"H", "J"}:
162            raise ValueError("Breed must be either Holstein = 'H' or Jersey 'J'")
163
164    if continent is not None:
165        continent = continent.upper()
166        if continent not in {"USA", "EU", "CHEN"}:
167            raise ValueError("continent must be 'USA', 'EU', or 'CHEN'")
168
169    dim = np.asarray(dim, dtype=float)
170    milkrecordings = np.asarray(milkrecordings, dtype=float)
171
172    mask = np.isfinite(dim) & np.isfinite(milkrecordings)
173    dim = dim[mask]
174    milkrecordings = milkrecordings[mask]
175
176    if len(dim) < 2:
177        raise ValueError("At least two non missing points are required to fit a lactation curve")
178
179    if persistency_method is not None:
180        persistency_method = persistency_method.lower()
181        if persistency_method not in {"derived", "literature"}:
182            raise ValueError("persistency_method must be either 'derived' or 'literature'")
183
184    if lactation_length is not None:
185        if isinstance(lactation_length, str):
186            if lactation_length.lower() != "max":
187                raise ValueError("lactation_length string option must be 'max'")
188        else:
189            lactation_length = int(lactation_length)
190
191    return PreparedInputs(
192        dim=dim,
193        milkrecordings=milkrecordings,
194        model=model or None,
195        fitting=fitting,
196        breed=breed,
197        parity=parity,
198        continent=continent,
199        persistency_method=persistency_method,
200        lactation_length=lactation_length,
201    )

Validate, normalize, and clean input data for lactation curve fitting.

This function performs basic consistency checks on the provided days-in-milk (DIM) and milk recording data, normalizes optional categorical parameters, and removes observations with missing or non-finite values. The cleaned and validated inputs are returned in a structured PreparedInputs object.

Parameters

  • dim : array-like. Days in milk corresponding to each milk recording.
  • milkrecordings : array-like. Milk yield measurements corresponding to dim.
  • model : str or None, optional. Name of the lactation curve model. If provided, the name is stripped of whitespace and converted to lowercase.
  • fitting : str or None, optional. Fitting approach to be used. Must be either "frequentist" or "bayesian" if provided.
  • breed : str or None, optional. Cow breed identifier. Must be "H" (Holstein) or "J" (Jersey) if provided. Case-insensitive.
  • parity : int or None, optional. Lactation number (parity). If provided, it is coerced to an integer.
  • continent : str or None, optional. Geographic region identifier. Must be one of "USA", "EU", or "CHEN" if provided. Case-insensitive.

Extra input for persistency calculation:

  • persistency_method (string): method used to calculate persistency. Options: 'derived', which gives the average slope of the lactation after the peak until the end of lactation (default), or 'literature', for the Wood and MilkBot models.
  • lactation_length (string or int): length of the lactation in days over which persistency is calculated. Options: 305 (default), 'max' to use the maximum DIM in the data, or any integer value to set the desired lactation length.

Returns

PreparedInputs A dataclass containing the cleaned numeric arrays (dim, milkrecordings) and the normalized optional parameters.

Raises

ValueError If input arrays have different lengths, contain insufficient valid observations, or if categorical parameters are invalid.

Notes

Observations with missing or non-finite values in either dim or milkrecordings are removed prior to model fitting. At least two valid observations are required to proceed.