Full API Reference

This section provides a detailed reference to all objects defined in NumerBlox.

BaseDownloader

Bases: BaseIO

Abstract base class for downloaders.

:param directory_path: Base folder to download files to.
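
Example

A minimal sketch of a custom downloader, assuming only what is documented here: subclasses implement download_training_data and download_live_data, and calling the instance triggers download_live_data. The class and folder names are hypothetical.

from numerblox.download import BaseDownloader

class MyDownloader(BaseDownloader):
    def download_training_data(self, *args, **kwargs):
        # Fetch full history and store it under self.dir (placeholder logic).
        ...

    def download_live_data(self, *args, **kwargs):
        # Fetch only the files needed for this week's inference.
        ...

downloader = MyDownloader("my_data")
downloader()  # Equivalent to downloader.download_live_data()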

Source code in numerblox/download.py
class BaseDownloader(BaseIO):
    """
    Abstract base class for downloaders.

    :param directory_path: Base folder to download files to.
    """
    def __init__(self, directory_path: str):
        super().__init__(directory_path=directory_path)

    @abstractmethod
    def download_training_data(self, *args, **kwargs):
        """ Download all necessary files needed for training. """
        ...

    @abstractmethod
    def download_live_data(self, *args, **kwargs):
        """ Download minimal amount of files needed for weekly inference. """
        ...

    @staticmethod
    def _load_json(file_path: str, verbose=False, *args, **kwargs) -> dict:
        """ Load JSON from file and return as dictionary. """
        with open(Path(file_path)) as json_file:
            json_data = json.load(json_file, *args, **kwargs)
        if verbose:
            print(json_data)
        return json_data

    def _default_save_path(self, start: dt, end: dt, backend: str):
        """ Save to downloader directory indicating backend, start date and end date as parquet file. """
        return f"{self.dir}/{backend}_{start.strftime('%Y%m%d')}_{end.strftime('%Y%m%d')}.parquet"

    def __call__(self, *args, **kwargs):
        """
        The most common use case will be to get weekly inference data. So calling the class itself returns inference data.
        """
        self.download_live_data(*args, **kwargs)

__call__(*args, **kwargs)

The most common use case is to get weekly inference data, so calling the object itself downloads live (inference) data.

Source code in numerblox/download.py
def __call__(self, *args, **kwargs):
    """
    The most common use case will be to get weekly inference data. So calling the class itself returns inference data.
    """
    self.download_live_data(*args, **kwargs)

download_live_data(*args, **kwargs) abstractmethod

Download minimal amount of files needed for weekly inference.

Source code in numerblox/download.py
@abstractmethod
def download_live_data(self, *args, **kwargs):
    """ Download minimal amount of files needed for weekly inference. """
    ...

download_training_data(*args, **kwargs) abstractmethod

Download all necessary files needed for training.

Source code in numerblox/download.py
@abstractmethod
def download_training_data(self, *args, **kwargs):
    """ Download all necessary files needed for training. """
    ...

BaseIO

Bases: ABC

Basic functionality for IO (downloading and uploading).

:param directory_path: Base folder for IO. Will be created if it does not exist.
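
Example

A minimal sketch of the GCS helpers, assuming Google Cloud credentials are already configured (e.g. via GOOGLE_APPLICATION_CREDENTIALS) and using NumeraiClassicDownloader (documented below) as the concrete BaseIO subclass. Bucket and file paths are placeholders.

from numerblox.download import NumeraiClassicDownloader

io = NumeraiClassicDownloader("data")
# Upload a single local file to a GCS bucket.
io.upload_file_to_gcs(
    bucket_name="my-bucket",
    gcs_path="numerai/train.parquet",
    local_path="data/train.parquet",
)
# Upload the whole base directory.
io.upload_directory_to_gcs(bucket_name="my-bucket", gcs_path="numerai")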

Source code in numerblox/download.py
class BaseIO(ABC):
    """
    Basic functionality for IO (downloading and uploading).

    :param directory_path: Base folder for IO. Will be created if it does not exist.
    """
    def __init__(self, directory_path: str):
        self.dir = Path(directory_path)
        self._create_directory()

    def remove_base_directory(self):
        """Remove directory with all contents."""
        abs_path = self.dir.resolve()
        print(
            f"WARNING: Deleting directory for '{self.__class__.__name__}'\nPath: '{abs_path}'"
        )
        shutil.rmtree(abs_path)

    def download_file_from_gcs(self, bucket_name: str, gcs_path: str):
        """
        Get file from GCS bucket and download to local directory.
        :param gcs_path: Path to file on GCS bucket.
        """
        blob_path = str(self.dir.resolve())
        blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=blob_path)
        blob.download_to_filename(gcs_path)
        print(
            f"Downloaded GCS object '{gcs_path}' from bucket '{blob.bucket.id}' to local directory '{blob_path}'."
        )

    def upload_file_to_gcs(self, bucket_name: str, gcs_path: str, local_path: str):
        """
        Upload file to some GCS bucket.
        :param gcs_path: Path to file on GCS bucket.
        """
        blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=gcs_path)
        blob.upload_from_filename(local_path)
        print(
            f"Local file '{local_path}' uploaded to '{gcs_path}' in bucket {blob.bucket.id}"
        )

    def download_directory_from_gcs(self, bucket_name: str, gcs_path: str):
        """
        Copy full directory from GCS bucket to local environment.
        :param gcs_path: Name of directory on GCS bucket.
        """
        blob_path = str(self.dir.resolve())
        blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=blob_path)
        for gcs_file in glob.glob(gcs_path + "/**", recursive=True):
            if os.path.isfile(gcs_file):
                blob.download_to_filename(blob_path)
        print(
            f"Directory '{gcs_path}' from bucket '{blob.bucket.id}' downloaded to '{blob_path}'"
        )

    def upload_directory_to_gcs(self, bucket_name: str, gcs_path: str):
        """
        Upload full base directory to GCS bucket.
        :param gcs_path: Name of directory on GCS bucket.
        """
        blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=gcs_path)
        for local_path in glob.glob(str(self.dir) + "/**", recursive=True):
            if os.path.isfile(local_path):
                blob.upload_from_filename(local_path)
        print(
            f"Directory '{self.dir}' uploaded to '{gcs_path}' in bucket {blob.bucket.id}"
        )

    def _get_gcs_blob(self, bucket_name: str, blob_path: str) -> storage.Blob:
        """ Create blob that interacts with Google Cloud Storage (GCS). """
        client = storage.Client()
        # https://console.cloud.google.com/storage/browser/[bucket_name]
        bucket = client.get_bucket(bucket_name)
        blob = bucket.blob(blob_path)
        return blob

    def _append_folder(self, folder: str) -> Path:
        """
        Return base directory Path object appended with 'folder'.
        Create directory if it does not exist.
        """
        dir = Path(self.dir / folder)
        dir.mkdir(parents=True, exist_ok=True)
        return dir

    def _get_dest_path(self, subfolder: str, filename: str) -> str:
        """ Prepare destination path for downloading. """
        dir = self._append_folder(subfolder)
        dest_path = str(dir.joinpath(filename.split("/")[-1]))
        return dest_path

    def _create_directory(self):
        """ Create base directory if it does not exist. """
        if not self.dir.is_dir():
            print(
                f"No existing directory found at '{self.dir}'. Creating directory..."
            )
            self.dir.mkdir(parents=True, exist_ok=True)

    @property
    def get_all_files(self) -> list:
        """ Return all paths of contents in directory. """
        return list(self.dir.iterdir())

    @property
    def is_empty(self) -> bool:
        """ Check if directory is empty. """
        return not bool(self.get_all_files)

get_all_files: list property

Return all paths of contents in directory.

is_empty: bool property

Check if directory is empty.

download_directory_from_gcs(bucket_name, gcs_path)

Copy full directory from GCS bucket to local environment.

:param gcs_path: Name of directory on GCS bucket.

Source code in numerblox/download.py
def download_directory_from_gcs(self, bucket_name: str, gcs_path: str):
    """
    Copy full directory from GCS bucket to local environment.
    :param gcs_path: Name of directory on GCS bucket.
    """
    blob_path = str(self.dir.resolve())
    blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=blob_path)
    for gcs_file in glob.glob(gcs_path + "/**", recursive=True):
        if os.path.isfile(gcs_file):
            blob.download_to_filename(blob_path)
    print(
        f"Directory '{gcs_path}' from bucket '{blob.bucket.id}' downloaded to '{blob_path}'"
    )

download_file_from_gcs(bucket_name, gcs_path)

Get file from GCS bucket and download to local directory.

:param gcs_path: Path to file on GCS bucket.

Source code in numerblox/download.py
def download_file_from_gcs(self, bucket_name: str, gcs_path: str):
    """
    Get file from GCS bucket and download to local directory.
    :param gcs_path: Path to file on GCS bucket.
    """
    blob_path = str(self.dir.resolve())
    blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=blob_path)
    blob.download_to_filename(gcs_path)
    print(
        f"Downloaded GCS object '{gcs_path}' from bucket '{blob.bucket.id}' to local directory '{blob_path}'."
    )

remove_base_directory()

Remove directory with all contents.

Source code in numerblox/download.py
def remove_base_directory(self):
    """Remove directory with all contents."""
    abs_path = self.dir.resolve()
    print(
        f"WARNING: Deleting directory for '{self.__class__.__name__}'\nPath: '{abs_path}'"
    )
    shutil.rmtree(abs_path)

upload_directory_to_gcs(bucket_name, gcs_path)

Upload full base directory to GCS bucket.

:param gcs_path: Name of directory on GCS bucket.

Source code in numerblox/download.py
def upload_directory_to_gcs(self, bucket_name: str, gcs_path: str):
    """
    Upload full base directory to GCS bucket.
    :param gcs_path: Name of directory on GCS bucket.
    """
    blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=gcs_path)
    for local_path in glob.glob(str(self.dir) + "/**", recursive=True):
        if os.path.isfile(local_path):
            blob.upload_from_filename(local_path)
    print(
        f"Directory '{self.dir}' uploaded to '{gcs_path}' in bucket {blob.bucket.id}"
    )

upload_file_to_gcs(bucket_name, gcs_path, local_path)

Upload file to some GCS bucket.

:param gcs_path: Path to file on GCS bucket.

Source code in numerblox/download.py
def upload_file_to_gcs(self, bucket_name: str, gcs_path: str, local_path: str):
    """
    Upload file to some GCS bucket.
    :param gcs_path: Path to file on GCS bucket.
    """
    blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=gcs_path)
    blob.upload_from_filename(local_path)
    print(
        f"Local file '{local_path}' uploaded to '{gcs_path}' in bucket {blob.bucket.id}"
    )

EODDownloader

Bases: BaseDownloader

Download data from EOD Historical Data.

More info: https://eodhistoricaldata.com/

Make sure you have the underlying Python package installed. pip install eod.

:param directory_path: Base folder to download files to.

:param key: Valid EOD client key.

:param tickers: List of valid EOD tickers (Bloomberg ticker format).

:param frequency: Choose from [d, w, m]. Daily data by default.
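
Example

A minimal usage sketch. The API key and tickers are placeholders; pip install eod is required.

from numerblox.download import EODDownloader

eod = EODDownloader(
    directory_path="eod_data",
    key="YOUR_EOD_KEY",
    tickers=["AAPL.US", "MSFT.US"],
    frequency="d",
)
eod.download_live_data()                        # Last year of prices -> parquet in eod_data/
eod.download_training_data(start="2010-01-01")  # Full history from the start date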

Source code in numerblox/download.py
class EODDownloader(BaseDownloader):
    """
    Download data from EOD historical data. \n
    More info: https://eodhistoricaldata.com/

    Make sure you have the underlying Python package installed.
    `pip install eod`.

    :param directory_path: Base folder to download files to. \n
    :param key: Valid EOD client key. \n
    :param tickers: List of valid EOD tickers (Bloomberg ticker format). \n
    :param frequency: Choose from [d, w, m]. \n
    Daily data by default.
    """
    def __init__(self,
                 directory_path: str,
                 key: str,
                 tickers: list,
                 frequency: str = "d"):
        super().__init__(directory_path=directory_path)
        self.key = key
        self.tickers = tickers
        try: 
            from eod import EodHistoricalData
        except ImportError:
            raise ImportError("Could not import eod package. Please install eod package with 'pip install eod'")
        self.client = EodHistoricalData(self.key)
        self.frequency = frequency
        self.current_time = dt.now()
        self.end_date = self.current_time.strftime("%Y-%m-%d")
        self.cpu_count = os.cpu_count()
        # Time to sleep in between API calls to avoid hitting EOD rate limits.
        # EOD rate limit is set at 1000 calls per minute.
        self.sleep_time = self.cpu_count / 32

    def download_live_data(self):
        """ Download one year of data for defined tickers. """
        start = (pd.Timestamp(self.current_time) - relativedelta(years=1)).strftime("%Y-%m-%d")
        dataf = self.get_numerframe_data(start=start)
        dataf.to_parquet(self._default_save_path(start=pd.Timestamp(start),
                                                 end=pd.Timestamp(self.end_date),
                                                 backend="eod"))

    def download_training_data(self, start: str = None):
        """
        Download full date length available.
        start: Starting data in %Y-%m-%d format.
        """
        start = start if start else "1970-01-01"
        dataf = self.generate_full_dataf(start=start)
        dataf.to_parquet(self._default_save_path(start=pd.Timestamp(start),
                                                 end=pd.Timestamp(self.end_date),
                                                 backend="eod"))

    def get_numerframe_data(self, start: str) -> NumerFrame:
        """
        Get NumerFrame data from some starting date.
        start: Starting data in %Y-%m-%d format.
        """
        dataf = self.generate_full_dataf(start=start)
        return NumerFrame(dataf)

    def generate_full_dataf(self, start: str) -> pd.DataFrame:
        """
        Collect all price data for list of EOD ticker symbols (Bloomberg tickers).
        start: Starting data in %Y-%m-%d format.
        """
        price_datafs = []
        with ThreadPoolExecutor(max_workers=self.cpu_count) as executor:
            tasks = [executor.submit(self.generate_stock_dataf, ticker, start) for ticker in self.tickers]
            for task in tqdm(concurrent.futures.as_completed(tasks),
                             total=len(self.tickers),
                             desc="EOD price data extraction"):
                price_datafs.append(task.result())
        return pd.concat(price_datafs)

    def generate_stock_dataf(self, ticker: str, start: str) -> pd.DataFrame:
        """
        Generate Price DataFrame for a single ticker.
        ticker: EOD ticker symbol (Bloomberg tickers).
        For example, Apple stock = AAPL.US.
        start: Starting data in %Y-%m-%d format.
        """
        time.sleep(self.sleep_time)
        try:
            resp = self.client.get_prices_eod(ticker, period=self.frequency,
                                              from_=start, to=self.end_date)
            stock_df = pd.DataFrame(resp).set_index('date')
            stock_df['ticker'] = ticker
        except Exception as e:
            print(f"WARNING: Date pull failed on ticker: '{ticker}'. Exception: {e}")
            stock_df = pd.DataFrame()
        return stock_df

download_live_data()

Download one year of data for defined tickers.

Source code in numerblox/download.py
def download_live_data(self):
    """ Download one year of data for defined tickers. """
    start = (pd.Timestamp(self.current_time) - relativedelta(years=1)).strftime("%Y-%m-%d")
    dataf = self.get_numerframe_data(start=start)
    dataf.to_parquet(self._default_save_path(start=pd.Timestamp(start),
                                             end=pd.Timestamp(self.end_date),
                                             backend="eod"))

download_training_data(start=None)

Download the full available date range.

start: Starting date in %Y-%m-%d format.

Source code in numerblox/download.py
def download_training_data(self, start: str = None):
    """
    Download full date length available.
    start: Starting data in %Y-%m-%d format.
    """
    start = start if start else "1970-01-01"
    dataf = self.generate_full_dataf(start=start)
    dataf.to_parquet(self._default_save_path(start=pd.Timestamp(start),
                                             end=pd.Timestamp(self.end_date),
                                             backend="eod"))

generate_full_dataf(start)

Collect all price data for the list of EOD ticker symbols (Bloomberg tickers).

start: Starting date in %Y-%m-%d format.

Source code in numerblox/download.py
def generate_full_dataf(self, start: str) -> pd.DataFrame:
    """
    Collect all price data for list of EOD ticker symbols (Bloomberg tickers).
    start: Starting data in %Y-%m-%d format.
    """
    price_datafs = []
    with ThreadPoolExecutor(max_workers=self.cpu_count) as executor:
        tasks = [executor.submit(self.generate_stock_dataf, ticker, start) for ticker in self.tickers]
        for task in tqdm(concurrent.futures.as_completed(tasks),
                         total=len(self.tickers),
                         desc="EOD price data extraction"):
            price_datafs.append(task.result())
    return pd.concat(price_datafs)

generate_stock_dataf(ticker, start)

Generate price DataFrame for a single ticker.

ticker: EOD ticker symbol (Bloomberg format). For example, Apple stock = AAPL.US.

start: Starting date in %Y-%m-%d format.

Source code in numerblox/download.py
def generate_stock_dataf(self, ticker: str, start: str) -> pd.DataFrame:
    """
    Generate Price DataFrame for a single ticker.
    ticker: EOD ticker symbol (Bloomberg tickers).
    For example, Apple stock = AAPL.US.
    start: Starting data in %Y-%m-%d format.
    """
    time.sleep(self.sleep_time)
    try:
        resp = self.client.get_prices_eod(ticker, period=self.frequency,
                                          from_=start, to=self.end_date)
        stock_df = pd.DataFrame(resp).set_index('date')
        stock_df['ticker'] = ticker
    except Exception as e:
        print(f"WARNING: Date pull failed on ticker: '{ticker}'. Exception: {e}")
        stock_df = pd.DataFrame()
    return stock_df

get_numerframe_data(start)

Get NumerFrame data from some starting date.

start: Starting date in %Y-%m-%d format.

Source code in numerblox/download.py
def get_numerframe_data(self, start: str) -> NumerFrame:
    """
    Get NumerFrame data from some starting date.
    start: Starting data in %Y-%m-%d format.
    """
    dataf = self.generate_full_dataf(start=start)
    return NumerFrame(dataf)

KaggleDownloader

Bases: BaseDownloader

Download financial data from Kaggle.

For authentication, make sure you have a directory called .kaggle in your home directory containing a kaggle.json file. kaggle.json should have the following structure:

{"username": USERNAME, "key": KAGGLE_API_KEY}

More info on authentication: github.com/Kaggle/kaggle-api#api-credentials

More info on the Kaggle Python API: kaggle.com/donkeys/kaggle-python-api

:param directory_path: Base folder to download files to.
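
Example

A minimal usage sketch, assuming valid Kaggle credentials are set up. The dataset slug is a placeholder.

from numerblox.download import KaggleDownloader

kd = KaggleDownloader("kaggle_data")
kd.download_training_data("some-user/some-financial-dataset")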

Source code in numerblox/download.py
class KaggleDownloader(BaseDownloader):
    """
    Download financial data from Kaggle.

    For authentication, make sure you have a directory called .kaggle in your home directory
    with therein a kaggle.json file. kaggle.json should have the following structure: \n
    `{"username": USERNAME, "key": KAGGLE_API_KEY}` \n
    More info on authentication: github.com/Kaggle/kaggle-api#api-credentials \n

    More info on the Kaggle Python API: kaggle.com/donkeys/kaggle-python-api \n

    :param directory_path: Base folder to download files to.
    """
    def __init__(self, directory_path: str):
        self.__check_kaggle_import()
        super().__init__(directory_path=directory_path)

    def download_live_data(self, kaggle_dataset_path: str):
        """
        Download arbitrary Kaggle dataset.
        :param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/)
        """
        self.download_training_data(kaggle_dataset_path)

    def download_training_data(self, kaggle_dataset_path: str):
        """
        Download arbitrary Kaggle dataset.
        :param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/)
        """
        import kaggle
        kaggle.api.dataset_download_files(kaggle_dataset_path,
                                          path=self.dir, unzip=True)

    @staticmethod
    def __check_kaggle_import():
        try:
            import kaggle
        except OSError:
            raise OSError("Could not find kaggle.json credentials. Make sure it's located in /home/runner/.kaggle. Or use the environment method. Check github.com/Kaggle/kaggle-api#api-credentials for more information on authentication.")

download_live_data(kaggle_dataset_path)

Download arbitrary Kaggle dataset.

:param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/).

Source code in numerblox/download.py
def download_live_data(self, kaggle_dataset_path: str):
    """
    Download arbitrary Kaggle dataset.
    :param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/)
    """
    self.download_training_data(kaggle_dataset_path)

download_training_data(kaggle_dataset_path)

Download arbitrary Kaggle dataset.

:param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/).

Source code in numerblox/download.py
def download_training_data(self, kaggle_dataset_path: str):
    """
    Download arbitrary Kaggle dataset.
    :param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/)
    """
    import kaggle
    kaggle.api.dataset_download_files(kaggle_dataset_path,
                                      path=self.dir, unzip=True)

NumeraiClassicDownloader

Bases: BaseDownloader

WARNING: Versions 1-3 (legacy data) are deprecated. Only versions 4+ are supported.

Downloading from NumerAPI for Numerai Classic data.

:param directory_path: Base folder to download files to.

All kwargs will be passed to NumerAPI initialization.
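
Example

A minimal usage sketch based on the methods documented below. Folder names are placeholders.

from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader("numerai_data")
# Training + validation parquet files for a given dataset version.
dl.download_training_data("train_val", version="4.3")
# Minimal live data for weekly inference (latest round by default).
dl.download_live_data("live", version="4.3")
# Clean up the base directory when done.
dl.remove_base_directory()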

Source code in numerblox/download.py
class NumeraiClassicDownloader(BaseDownloader):
    """
    WARNING: Versions 1-3 (legacy data) are deprecated. Only supporting version 4+.

    Downloading from NumerAPI for Numerai Classic data. \n
    :param directory_path: Base folder to download files to. \n
    All kwargs will be passed to NumerAPI initialization.
    """
    TRAIN_DATASET_NAME = "train_int8.parquet"
    VALIDATION_DATASET_NAME = "validation_int8.parquet"
    LIVE_DATASET_NAME = "live_int8.parquet"
    LIVE_EXAMPLE_PREDS_NAME = "live_example_preds.parquet"
    VALIDATION_EXAMPLE_PREDS_NAME = "validation_example_preds.parquet"

    def __init__(self, directory_path: str, **kwargs):
        super().__init__(directory_path=directory_path)
        self.napi = NumerAPI(**kwargs)
        self.current_round = self.napi.get_current_round()
        # Get all available versions available for Numerai.
        self.dataset_versions = set(s.split("/")[0] for s in self.napi.list_datasets())
        self.dataset_versions.discard("signals")

    def download_training_data(
        self, subfolder: str = "", version: str = "4.3"
    ):
        """
        Get Numerai classic training and validation data.
        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param version: Numerai dataset version.
        4 = April 2022 dataset
        4.1 = Sunshine dataset
        4.2 (default) = Rain Dataset
        4.3 = Midnight dataset
        """
        self._check_dataset_version(version)
        train_val_files = [f"v{version}/{self.TRAIN_DATASET_NAME}",
                           f"v{version}/{self.VALIDATION_DATASET_NAME}"]
        for file in train_val_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path
            )

    def download_single_dataset(
        self, filename: str, dest_path: str, round_num: int = None
    ):
        """
        Download one of the available datasets through NumerAPI.

        :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets() for full overview)
        :param dest_path: Full path where file will be saved.
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        """
        print(
            f"Downloading '{filename}'."
        )
        self.napi.download_dataset(
            filename=filename,
            dest_path=dest_path,
            round_num=round_num
        )

    def download_live_data(
            self,
            subfolder: str = "",
            version: str = "4.3",
            round_num: int = None
    ):
        """
        Download all live data in specified folder for given version (i.e. minimal data needed for inference).

        :param subfolder: Specify folder to create folder within directory root.
        Saves in directory root by default.
        :param version: Numerai dataset version. 
        4 = April 2022 dataset
        4.1 = Sunshine dataset
        4.2 (default) = Rain Dataset
        4.3 = Midnight dataset
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        """
        self._check_dataset_version(version)
        live_files = [f"v{version}/{self.LIVE_DATASET_NAME}"]
        for file in live_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path,
                round_num=round_num
            )

    def download_example_data(
        self, subfolder: str = "", version: str = "4.3", round_num: int = None
    ):
        """
        Download all example prediction data in specified folder for given version.

        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param version: Numerai dataset version.
        4 = April 2022 dataset
        4.1 = Sunshine dataset
        4.2 (default) = Rain Dataset
        4.3 = Midnight dataset
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        """
        self._check_dataset_version(version)
        example_files = [f"v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}", 
                         f"v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
        for file in example_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path,
                round_num=round_num
            )

    def get_classic_features(self, subfolder: str = "", filename="v4.3/features.json", *args, **kwargs) -> dict:
        """
        Download feature overview (stats and feature sets) through NumerAPI and load as dict.
        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param filename: name for feature overview.
        *args, **kwargs will be passed to the JSON loader.
        :return: Feature overview dict
        """
        version = filename.split("/")[0].replace("v", "")
        self._check_dataset_version(version)
        dest_path = self._get_dest_path(subfolder, filename)
        self.download_single_dataset(filename=filename,
                                     dest_path=dest_path)
        json_data = self._load_json(dest_path, *args, **kwargs)
        return json_data

    def download_meta_model_preds(self, subfolder: str = "", filename="v4.3/meta_model.parquet") -> pd.DataFrame:
        """
        Download Meta model predictions through NumerAPI.
        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param filename: name for meta model predictions file.
        :return: Meta model predictions as DataFrame.
        """
        version = filename.split("/")[0].replace("v", "")
        self._check_dataset_version(version)
        dest_path = self._get_dest_path(subfolder, filename)
        self.download_single_dataset(
            filename=filename,
            dest_path=dest_path,
            )
        return pd.read_parquet(dest_path)

    def _check_dataset_version(self, version: str):
        assert f"v{version}" in self.dataset_versions, f"Version '{version}' is not available in NumerAPI."

download_example_data(subfolder='', version='4.3', round_num=None)

Download all example prediction data in specified folder for given version.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param version: Numerai dataset version. 4 = April 2022 dataset, 4.1 = Sunshine dataset, 4.2 = Rain dataset, 4.3 (default) = Midnight dataset.

:param round_num: Numerai tournament round number. Downloads latest round by default.

Source code in numerblox/download.py
def download_example_data(
    self, subfolder: str = "", version: str = "4.3", round_num: int = None
):
    """
    Download all example prediction data in specified folder for given version.

    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param version: Numerai dataset version.
    4 = April 2022 dataset
    4.1 = Sunshine dataset
    4.2 (default) = Rain Dataset
    4.3 = Midnight dataset
    :param round_num: Numerai tournament round number. Downloads latest round by default.
    """
    self._check_dataset_version(version)
    example_files = [f"v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}", 
                     f"v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
    for file in example_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path,
            round_num=round_num
        )

download_live_data(subfolder='', version='4.3', round_num=None)

Download all live data in specified folder for given version (i.e. minimal data needed for inference).

:param subfolder: Specify folder to create within the directory root. Saves in the directory root by default.

:param version: Numerai dataset version. 4 = April 2022 dataset, 4.1 = Sunshine dataset, 4.2 = Rain dataset, 4.3 (default) = Midnight dataset.

:param round_num: Numerai tournament round number. Downloads latest round by default.

Source code in numerblox/download.py
def download_live_data(
        self,
        subfolder: str = "",
        version: str = "4.3",
        round_num: int = None
):
    """
    Download all live data in specified folder for given version (i.e. minimal data needed for inference).

    :param subfolder: Specify folder to create folder within directory root.
    Saves in directory root by default.
    :param version: Numerai dataset version. 
    4 = April 2022 dataset
    4.1 = Sunshine dataset
    4.2 (default) = Rain Dataset
    4.3 = Midnight dataset
    :param round_num: Numerai tournament round number. Downloads latest round by default.
    """
    self._check_dataset_version(version)
    live_files = [f"v{version}/{self.LIVE_DATASET_NAME}"]
    for file in live_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path,
            round_num=round_num
        )

download_meta_model_preds(subfolder='', filename='v4.3/meta_model.parquet')

Download Meta model predictions through NumerAPI.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param filename: Name for the meta model predictions file.

:return: Meta model predictions as DataFrame.
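
A short sketch of fetching the meta model predictions as a DataFrame (the directory name is a placeholder):

from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader("numerai_data")
meta_model_preds = dl.download_meta_model_preds()
print(meta_model_preds.head())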

Source code in numerblox/download.py
def download_meta_model_preds(self, subfolder: str = "", filename="v4.3/meta_model.parquet") -> pd.DataFrame:
    """
    Download Meta model predictions through NumerAPI.
    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param filename: name for meta model predictions file.
    :return: Meta model predictions as DataFrame.
    """
    version = filename.split("/")[0].replace("v", "")
    self._check_dataset_version(version)
    dest_path = self._get_dest_path(subfolder, filename)
    self.download_single_dataset(
        filename=filename,
        dest_path=dest_path,
        )
    return pd.read_parquet(dest_path)

download_single_dataset(filename, dest_path, round_num=None)

Download one of the available datasets through NumerAPI.

:param filename: Name as listed in NumerAPI (check NumerAPI().list_datasets() for a full overview).

:param dest_path: Full path where file will be saved.

:param round_num: Numerai tournament round number. Downloads latest round by default.
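
A short sketch of downloading one specific dataset file; the destination path is a placeholder:

from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader("numerai_data")
dl.download_single_dataset(
    filename="v4.3/live_int8.parquet",
    dest_path="numerai_data/live_int8.parquet",
)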

Source code in numerblox/download.py
def download_single_dataset(
    self, filename: str, dest_path: str, round_num: int = None
):
    """
    Download one of the available datasets through NumerAPI.

    :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets() for full overview)
    :param dest_path: Full path where file will be saved.
    :param round_num: Numerai tournament round number. Downloads latest round by default.
    """
    print(
        f"Downloading '{filename}'."
    )
    self.napi.download_dataset(
        filename=filename,
        dest_path=dest_path,
        round_num=round_num
    )

download_training_data(subfolder='', version='4.3')

Get Numerai Classic training and validation data.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param version: Numerai dataset version. 4 = April 2022 dataset, 4.1 = Sunshine dataset, 4.2 = Rain dataset, 4.3 (default) = Midnight dataset.

Source code in numerblox/download.py
def download_training_data(
    self, subfolder: str = "", version: str = "4.3"
):
    """
    Get Numerai classic training and validation data.
    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param version: Numerai dataset version.
    4 = April 2022 dataset
    4.1 = Sunshine dataset
    4.2 (default) = Rain Dataset
    4.3 = Midnight dataset
    """
    self._check_dataset_version(version)
    train_val_files = [f"v{version}/{self.TRAIN_DATASET_NAME}",
                       f"v{version}/{self.VALIDATION_DATASET_NAME}"]
    for file in train_val_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path
        )

get_classic_features(subfolder='', filename='v4.3/features.json', *args, **kwargs)

Download feature overview (stats and feature sets) through NumerAPI and load as dict.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param filename: Name for the feature overview file.

*args, **kwargs will be passed to the JSON loader.

:return: Feature overview dict.
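
A short sketch of loading the feature overview and selecting a predefined feature set. The "feature_sets" and "medium" keys reflect the usual layout of features.json and are assumptions here.

from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader("numerai_data")
feature_overview = dl.get_classic_features(filename="v4.3/features.json")
medium_features = feature_overview["feature_sets"]["medium"]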

Source code in numerblox/download.py
def get_classic_features(self, subfolder: str = "", filename="v4.3/features.json", *args, **kwargs) -> dict:
    """
    Download feature overview (stats and feature sets) through NumerAPI and load as dict.
    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param filename: name for feature overview.
    *args, **kwargs will be passed to the JSON loader.
    :return: Feature overview dict
    """
    version = filename.split("/")[0].replace("v", "")
    self._check_dataset_version(version)
    dest_path = self._get_dest_path(subfolder, filename)
    self.download_single_dataset(filename=filename,
                                 dest_path=dest_path)
    json_data = self._load_json(dest_path, *args, **kwargs)
    return json_data

NumeraiSignalsDownloader

Bases: BaseDownloader

Support for Numerai Signals v1 parquet data.

Downloading from SignalsAPI for Numerai Signals data.

:param directory_path: Base folder to download files to.

All kwargs will be passed to SignalsAPI initialization.
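
Example

A minimal usage sketch mirroring the classic downloader. Folder names are placeholders.

from numerblox.download import NumeraiSignalsDownloader

sdl = NumeraiSignalsDownloader("signals_data")
sdl.download_training_data("train_val", version="1.0")
sdl.download_live_data("live", version="1.0")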

Source code in numerblox/download.py
class NumeraiSignalsDownloader(BaseDownloader):
    """
    Support for Numerai Signals v1 parquet data.

    Downloading from SignalsAPI for Numerai Signals data. \n
    :param directory_path: Base folder to download files to. \n
    All kwargs will be passed to SignalsAPI initialization.
    """
    TRAIN_DATASET_NAME = "train.parquet"
    VALIDATION_DATASET_NAME = "validation.parquet"
    LIVE_DATASET_NAME = "live.parquet"
    LIVE_EXAMPLE_PREDS_NAME = "live_example_preds.parquet"
    VALIDATION_EXAMPLE_PREDS_NAME = "validation_example_preds.parquet"

    def __init__(self, directory_path: str, **kwargs):
        super().__init__(directory_path=directory_path)
        self.sapi = SignalsAPI(**kwargs)
        self.current_round = self.sapi.get_current_round()
        # Get all available versions available for Numerai Signals.
        self.dataset_versions = set(s.replace("signals/", "").split("/")[0] for s in self.sapi.list_datasets() if s.startswith("signals/v"))

    def download_training_data(
        self, subfolder: str = "", version: str = "1.0"
    ):
        """
        Get Numerai Signals training and validation data.
        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param version: Numerai Signals dataset version.
        Currently only v1.0 is supported.
        """
        self._check_dataset_version(version)
        train_val_files = [f"signals/v{version}/{self.TRAIN_DATASET_NAME}",
                           f"signals/v{version}/{self.VALIDATION_DATASET_NAME}"]
        for file in train_val_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path
            )

    def download_single_dataset(
        self, filename: str, dest_path: str
    ):
        """
        Download one of the available datasets through SignalsAPI.

        :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets() for full overview)
        :param dest_path: Full path where file will be saved.
        """
        print(
            f"Downloading '{filename}'."
        )
        self.sapi.download_dataset(
            filename=filename,
            dest_path=dest_path,
        )

    def download_live_data(
            self,
            subfolder: str = "",
            version: str = "1.0",
    ):
        """
        Download all live data in specified folder (i.e. minimal data needed for inference).

        :param subfolder: Specify folder to create folder within directory root.
        Saves in directory root by default.
        :param version: Numerai dataset version. 
        Currently only v1.0 is supported.
        """
        self._check_dataset_version(version)
        live_files = [f"signals/v{version}/{self.LIVE_DATASET_NAME}"]
        for file in live_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path,
            )

    def download_example_data(
        self, subfolder: str = "", version: str = "1.0"
    ):
        """
        Download all example prediction data in specified folder for given version.

        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param version: Numerai dataset version.
        Currently only v1.0 is supported.
        """
        self._check_dataset_version(version)
        example_files = [f"signals/v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}", 
                         f"signals/v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
        for file in example_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path,
            )

    def _check_dataset_version(self, version: str):
        assert f"v{version}" in self.dataset_versions, f"Version '{version}' is not available in SignalsAPI."

download_example_data(subfolder='', version='1.0')

Download all example prediction data in specified folder for given version.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param version: Numerai dataset version. Currently only v1.0 is supported.

Source code in numerblox/download.py
def download_example_data(
    self, subfolder: str = "", version: str = "1.0"
):
    """
    Download all example prediction data in specified folder for given version.

    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param version: Numerai dataset version.
    Currently only v1.0 is supported.
    """
    self._check_dataset_version(version)
    example_files = [f"signals/v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}", 
                     f"signals/v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
    for file in example_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path,
        )

download_live_data(subfolder='', version='1.0')

Download all live data in specified folder (i.e. minimal data needed for inference).

:param subfolder: Specify folder to create within the directory root. Saves in the directory root by default.

:param version: Numerai dataset version. Currently only v1.0 is supported.

Source code in numerblox/download.py
def download_live_data(
        self,
        subfolder: str = "",
        version: str = "1.0",
):
    """
    Download all live data in specified folder (i.e. minimal data needed for inference).

    :param subfolder: Specify folder to create folder within directory root.
    Saves in directory root by default.
    :param version: Numerai dataset version. 
    Currently only v1.0 is supported.
    """
    self._check_dataset_version(version)
    live_files = [f"signals/v{version}/{self.LIVE_DATASET_NAME}"]
    for file in live_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path,
        )

download_single_dataset(filename, dest_path)

Download one of the available datasets through SignalsAPI.

:param filename: Name as listed in NumerAPI (check NumerAPI().list_datasets() for a full overview).

:param dest_path: Full path where file will be saved.

Source code in numerblox/download.py
def download_single_dataset(
    self, filename: str, dest_path: str
):
    """
    Download one of the available datasets through SignalsAPI.

    :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets() for full overview)
    :param dest_path: Full path where file will be saved.
    """
    print(
        f"Downloading '{filename}'."
    )
    self.sapi.download_dataset(
        filename=filename,
        dest_path=dest_path,
    )

download_training_data(subfolder='', version='1.0')

Get Numerai Signals training and validation data.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param version: Numerai Signals dataset version. Currently only v1.0 is supported.

Source code in numerblox/download.py
def download_training_data(
    self, subfolder: str = "", version: str = "1.0"
):
    """
    Get Numerai Signals training and validation data.
    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param version: Numerai Signals dataset version.
    Currently only v1.0 is supported.
    """
    self._check_dataset_version(version)
    train_val_files = [f"signals/v{version}/{self.TRAIN_DATASET_NAME}",
                       f"signals/v{version}/{self.VALIDATION_DATASET_NAME}"]
    for file in train_val_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path
        )

NumerFrame

Bases: DataFrame

Data structure which extends Pandas DataFrames and allows for additional Numerai-specific functionality.
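
Example

A minimal usage sketch, assuming a Numerai-style parquet file with 'era', 'feature_...' and 'target...' columns. The file path is a placeholder.

import pandas as pd
from numerblox.numerframe import NumerFrame

df = pd.read_parquet("numerai_data/train_int8.parquet")
nf = NumerFrame(df)

X, y = nf.get_feature_target_pair(multi_target=False)  # Features vs. main target
recent = nf.get_last_n_eras(10)                         # Last 10 eras
fncv3_features = nf.get_fncv3_feature_data              # FNCv3 feature subset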

Source code in numerblox/numerframe.py
class NumerFrame(pd.DataFrame):
    """
    Data structure which extends Pandas DataFrames and
    allows for additional Numerai specific functionality.
    """
    _metadata = ["meta", "feature_cols", "target_cols",
                 "prediction_cols", "not_aux_cols", "aux_cols"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.meta = AttrDict()
        self.__set_era_col()
        self.__init_meta_attrs()

    @property
    def _constructor(self):
        return NumerFrame

    def __init_meta_attrs(self):
        """ Dynamically track column groups. """
        self.feature_cols = [col for col in self.columns if str(col).startswith("feature")]
        self.target_cols = [col for col in self.columns if str(col).startswith("target")]
        self.prediction_cols = [
            col for col in self.columns if str(col).startswith("prediction")
        ]
        self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
        self.aux_cols = [
            col for col in self.columns if col not in self.not_aux_cols
        ]

    def __set_era_col(self):
        """ Each NumerFrame should have an era column to benefit from all functionality. """
        if "era" in self.columns:
            self.meta.era_col = "era"
        elif "date" in self.columns:
            self.meta.era_col = "date"
        else:
            self.meta.era_col = None

    def get_column_selection(self, cols: Union[str, list]) -> "NumerFrame":
        """ Return NumerFrame from selection of columns. """
        return self.loc[:, cols if isinstance(cols, list) else [cols]]

    @property
    def get_feature_data(self) -> "NumerFrame":
        """ All columns for which name starts with 'target'."""
        return self.get_column_selection(cols=self.feature_cols)

    @property
    def get_target_data(self) -> "NumerFrame":
        """ All columns for which name starts with 'target'."""
        return self.get_column_selection(cols=self.target_cols)

    @property
    def get_single_target_data(self) -> "NumerFrame":
        """ Column with name 'target' (Main Numerai target column). """
        return self.get_column_selection(cols=['target'])

    @property
    def get_prediction_data(self) -> "NumerFrame":
        """ All columns for which name starts with 'prediction'."""
        return self.get_column_selection(cols=self.prediction_cols)

    @property
    def get_aux_data(self) -> "NumerFrame":
        """ All columns that are not features, targets or predictions. """
        return self.get_column_selection(cols=self.aux_cols)

    @property
    def get_era_data(self) -> "NumerFrame":
        """ Column of all eras. """
        return self.get_column_selection(cols=self.meta.era_col)

    @property
    def get_prediction_aux_data(self) -> "NumerFrame":
        """ All predictions columns and aux columns (for ensembling, etc.). """
        return self.get_column_selection(cols=self.prediction_cols+self.aux_cols)

    @property
    def get_fncv3_feature_data(self) -> "NumerFrame":
        """ Get FNCv3 features. """
        return self.get_column_selection(cols=FNCV3_FEATURES)

    @property
    def get_small_feature_data(self) -> "NumerFrame":
        """ Small subset of the Numerai dataset for v4.2 data. """
        return self.get_column_selection(cols=SMALL_FEATURES)

    @property
    def get_medium_feature_data(self) -> "NumerFrame":
        """ Medium subset of the Numerai dataset for v4.2 data. """
        return self.get_column_selection(cols=MEDIUM_FEATURES)

    @property
    def get_v2_equivalent_feature_data(self) -> "NumerFrame":
        """ Features equivalent to the deprecated v2 Numerai data. For v4.2 data. """
        return self.get_column_selection(cols=V2_EQUIVALENT_FEATURES)

    @property
    def get_v3_equivalent_feature_data(self) -> "NumerFrame":
        """ Features equivalent to the deprecated v3 Numerai data. For v4.2 data. """
        return self.get_column_selection(cols=V3_EQUIVALENT_FEATURES)

    @property
    def get_unique_eras(self) -> List[str]:
        """ Get all unique eras in the data. """
        return self[self.meta.era_col].unique().tolist()

    def get_last_n_eras(self, n: int) -> "NumerFrame":
        """ 
        Get data for the last n eras. 
        Make sure eras are sorted in the way you prefer.
        :param n: Number of eras to select.
        :return: NumerFrame with last n eras.
        """
        eras = self[self.meta.era_col].unique()[-n:]
        return self.loc[self[self.meta.era_col].isin(eras)]

    def get_feature_group(self, group: str) -> "NumerFrame":
        """ Get feature group based on name or list of names. """
        assert group in V4_2_FEATURE_GROUP_MAPPING.keys(), \
            f"Group '{group}' not found in {V4_2_FEATURE_GROUP_MAPPING.keys()}"
        return self.get_column_selection(cols=V4_2_FEATURE_GROUP_MAPPING[group])

    def get_pattern_data(self, pattern: str) -> "NumerFrame":
        """
        Get columns based on pattern (for example '_20' to get all 20-day Numerai targets).
        :param pattern: A 'like' pattern (pattern in column_name == True)
        """
        return self.filter(like=pattern)

    def get_feature_target_pair(self, multi_target=False) -> Tuple["NumerFrame", "NumerFrame"]:
        """
        Get split of feature and target columns.
        :param multi_target: Returns only 'target' column by default.
        Returns all target columns when set to True.
        """
        X = self.get_feature_data
        y = self.get_target_data if multi_target else self.get_single_target_data
        return X, y

    def get_era_batch(self, eras: List[Any],
                      convert_to_tf = False,
                      aemlp_batch = False,
                      features: list = None,
                      targets: list = None,
                      *args, **kwargs) -> Tuple["NumerFrame", "NumerFrame"]:
        """
        Get feature target pair batch of 1 or multiple eras. \n
        :param eras: Selection of era names that should be present in era_col. \n
        :param convert_to_tf: Convert to tf.Tensor. \n
        :param aemlp_batch: Specific target batch for autoencoder training. \n
        `y` output will contain three components: features, targets and targets. \n
        :param features: List of features to select. All by default \n
        :param targets: List of targets to select. All by default. \n
        *args, **kwargs are passed to initialization of Tensor.
        """
        valid_eras = []
        for era in eras:
            assert era in self[self.meta.era_col].unique(), f"Era '{era}' not found in era column ({self.meta.era_col})"
            valid_eras.append(era)
        features = features if features else self.feature_cols
        targets = targets if targets else self.target_cols
        X = self.loc[self[self.meta.era_col].isin(valid_eras)][features].values
        y = self.loc[self[self.meta.era_col].isin(valid_eras)][targets].values
        if aemlp_batch:
            y = [X.copy(), y.copy(), y.copy()]

        if convert_to_tf:
            try:
                import tensorflow as tf
            except ImportError:
                raise ImportError("TensorFlow is not installed. Please make sure to have Tensorflow installed when setting `convert_to_tf=True`.")
            X = tf.convert_to_tensor(X, *args, **kwargs)
            if aemlp_batch:
                y = [tf.convert_to_tensor(i, *args, **kwargs) for i in y]
            else:
                y = tf.convert_to_tensor(y, *args, **kwargs)
        return X, y

    @property
    def get_dates_from_era_col(self) -> pd.Series:
        """ Column of all dates from era column. """
        assert "era" in self.columns, \
            "No 'era' column in NumerFrame. Please make sure to have a valid 'era' column to use for converting to dates."
        return self["era"].astype(int).apply(self.get_date_from_era)

    @property
    def get_eras_from_date_col(self) -> pd.Series:
        """ Column of all eras from date column. """
        assert "date" in self.columns, \
            "No 'date' column in NumerFrame. Please make sure to have a valid 'date' column."
        return self["date"].apply(self.get_era_from_date)

    def get_era_range(self, start_era: int, end_era: int) -> "NumerFrame":
        """ 
        Get all eras between two era numbers. 
        :param start_era: Era number to start from (inclusive).
        :param end_era: Era number to end with (inclusive).
        :return: NumerFrame with all eras between start_era and end_era.
        """
        assert "era" in self.columns, "No 'era' column in NumerFrame. Please make sure to have an 'era' column."
        assert isinstance(start_era, int), f"start_era should be of type 'int' but is '{type(start_era)}'"
        assert isinstance(end_era, int), f"end_era should be of type 'int' but is '{type(end_era)}'"
        assert 1 <= start_era <= end_era <= get_current_era(), \
            f"start_era should be between 1 and {get_current_era()}. Got '{start_era}'."
        assert 1 <= start_era <= end_era <= get_current_era(), \
            f"end_era should be between 1 and {get_current_era()}. Got '{end_era}'."
        assert start_era <= end_era, f"start_era should be before end_era. Got '{start_era}' and '{end_era}'"

        temp_df = self.copy()
        temp_df['era_int'] = temp_df['era'].astype(int)
        result_df = temp_df[(temp_df['era_int'] >= start_era) & (temp_df['era_int'] <= end_era)]
        return result_df.drop(columns=['era_int'])

    def get_date_range(self, start_date: pd.Timestamp, end_date: pd.Timestamp) -> "NumerFrame":
        """
        Get all eras between two dates.
        :param start_date: Starting date (inclusive).
        :param end_date: Ending date (inclusive).
        :return: NumerFrame with all eras between start_date and end_date.
        """
        assert "date" in self.columns, \
            "No 'date' column in NumerFrame. Please make sure to have a valid 'date' column."
        assert isinstance(start_date, pd.Timestamp), f"start_date should be of type 'pd.Timestamp' but is '{type(start_date)}'"
        assert isinstance(end_date, pd.Timestamp), f"end_date should be of type 'pd.Timestamp' but is '{type(end_date)}'"
        assert ERA1_TIMESTAMP <= start_date <= pd.Timestamp(get_current_date()), \
            f"start_date should be between {ERA_ONE_START} and {pd.Timestamp(get_current_date())}"
        assert ERA1_TIMESTAMP <= end_date <= pd.Timestamp(get_current_date()), \
            f"end_date should be between {ERA_ONE_START} and {pd.Timestamp(get_current_date())}"
        assert start_date <= end_date, f"start_date should be before end_date. Got '{start_date}' and '{end_date}'"

        temp_df = self.copy()
        result_df = temp_df[(temp_df["date"] >= start_date) & (temp_df["date"] <= end_date)]
        return result_df

    @staticmethod
    def get_era_from_date(date_object: pd.Timestamp) -> int:
        """ 
        Get the era number from a specific date. 
        :param date_object: Pandas Timestamp object for which to get era.
        :return: Era number.
        """
        assert isinstance(date_object, pd.Timestamp), f"date_object should be of type 'date' but is '{type(date_object)}'"
        current_date = pd.Timestamp(get_current_date())
        assert ERA1_TIMESTAMP <= date_object <= current_date, \
            f"date_object should be between {ERA_ONE_START} and {current_date}"
        return get_era_for_date(date_object.date())

    @staticmethod
    def get_date_from_era(era: int) -> pd.Timestamp:
        """ 
        Get the date from a specific era. 
        :param era: Era number for which to get date.
        Should be an integer which is at least 1.
        :return: Datetime object representing the date of the given era.
        """
        assert isinstance(era, int), f"era should be of type 'int' but is '{type(era)}'"
        assert 1 <= era <= get_current_era(), \
            f"era should be between 1 and {get_current_era()}. Got '{era}'."
        return pd.Timestamp(get_date_for_era(era))
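
For orientation, a minimal usage sketch of the selection and era utilities above. The file path, feature group name and era values are illustrative; any NumerFrame with an 'era' column works the same way.

import pandas as pd
from numerblox.numerframe import create_numerframe

df = create_numerframe("train.parquet")  # hypothetical path to a Numerai data file

# Column and era selection
sunshine = df.get_feature_group("sunshine")
targets_20d = df.get_pattern_data("_20")
recent = df.get_last_n_eras(52)
subset = df.get_era_range(start_era=100, end_era=200)

# Era <-> date conversion
first_date = df.get_date_from_era(era=1)
era_number = df.get_era_from_date(pd.Timestamp("2020-01-03"))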

get_aux_data: NumerFrame property

All columns that are not features, targets or predictions.

get_dates_from_era_col: pd.Series property

Column of all dates from era column.

get_era_data: NumerFrame property

Column of all eras.

get_eras_from_date_col: pd.Series property

Column of all eras from date column.

get_feature_data: NumerFrame property

All columns for which name starts with 'feature'.

get_fncv3_feature_data: NumerFrame property

Get FNCv3 features.

get_medium_feature_data: NumerFrame property

Medium subset of the Numerai dataset for v4.2 data.

get_prediction_aux_data: NumerFrame property

All predictions columns and aux columns (for ensembling, etc.).

get_prediction_data: NumerFrame property

All columns for which name starts with 'prediction'.

get_single_target_data: NumerFrame property

Column with name 'target' (Main Numerai target column).

get_small_feature_data: NumerFrame property

Small subset of the Numerai dataset for v4.2 data.

get_target_data: NumerFrame property

All columns for which name starts with 'target'.

get_unique_eras: List[str] property

Get all unique eras in the data.

get_v2_equivalent_feature_data: NumerFrame property

Features equivalent to the deprecated v2 Numerai data. For v4.2 data.

get_v3_equivalent_feature_data: NumerFrame property

Features equivalent to the deprecated v3 Numerai data. For v4.2 data.

__init_meta_attrs()

Dynamically track column groups.

Source code in numerblox/numerframe.py
def __init_meta_attrs(self):
    """ Dynamically track column groups. """
    self.feature_cols = [col for col in self.columns if str(col).startswith("feature")]
    self.target_cols = [col for col in self.columns if str(col).startswith("target")]
    self.prediction_cols = [
        col for col in self.columns if str(col).startswith("prediction")
    ]
    self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
    self.aux_cols = [
        col for col in self.columns if col not in self.not_aux_cols
    ]

__set_era_col()

Each NumerFrame should have an era column to benefit from all functionality.

Source code in numerblox/numerframe.py
def __set_era_col(self):
    """ Each NumerFrame should have an era column to benefit from all functionality. """
    if "era" in self.columns:
        self.meta.era_col = "era"
    elif "date" in self.columns:
        self.meta.era_col = "date"
    else:
        self.meta.era_col = None

get_column_selection(cols)

Return NumerFrame from selection of columns.

Source code in numerblox/numerframe.py
def get_column_selection(self, cols: Union[str, list]) -> "NumerFrame":
    """ Return NumerFrame from selection of columns. """
    return self.loc[:, cols if isinstance(cols, list) else [cols]]

get_date_from_era(era) staticmethod

Get the date from a specific era.

:param era: Era number for which to get date. Should be an integer which is at least 1.

:return: Datetime object representing the date of the given era.

Source code in numerblox/numerframe.py
@staticmethod
def get_date_from_era(era: int) -> pd.Timestamp:
    """ 
    Get the date from a specific era. 
    :param era: Era number for which to get date.
    Should be an integer which is at least 1.
    :return: Datetime object representing the date of the given era.
    """
    assert isinstance(era, int), f"era should be of type 'int' but is '{type(era)}'"
    assert 1 <= era <= get_current_era(), \
        f"era should be between 1 and {get_current_era()}. Got '{era}'."
    return pd.Timestamp(get_date_for_era(era))

get_date_range(start_date, end_date)

Get all eras between two dates.

:param start_date: Starting date (inclusive).

:param end_date: Ending date (inclusive).

:return: NumerFrame with all eras between start_date and end_date.

Source code in numerblox/numerframe.py
def get_date_range(self, start_date: pd.Timestamp, end_date: pd.Timestamp) -> "NumerFrame":
    """
    Get all eras between two dates.
    :param start_date: Starting date (inclusive).
    :param end_date: Ending date (inclusive).
    :return: NumerFrame with all eras between start_date and end_date.
    """
    assert "date" in self.columns, \
        "No 'date' column in NumerFrame. Please make sure to have a valid 'date' column."
    assert isinstance(start_date, pd.Timestamp), f"start_date should be of type 'pd.Timestamp' but is '{type(start_date)}'"
    assert isinstance(end_date, pd.Timestamp), f"end_date should be of type 'pd.Timestamp' but is '{type(end_date)}'"
    assert ERA1_TIMESTAMP <= start_date <= pd.Timestamp(get_current_date()), \
        f"start_date should be between {ERA_ONE_START} and {pd.Timestamp(get_current_date())}"
    assert ERA1_TIMESTAMP <= end_date <= pd.Timestamp(get_current_date()), \
        f"end_date should be between {ERA_ONE_START} and {pd.Timestamp(get_current_date())}"
    assert start_date <= end_date, f"start_date should be before end_date. Got '{start_date}' and '{end_date}'"

    temp_df = self.copy()
    result_df = temp_df[(temp_df["date"] >= start_date) & (temp_df["date"] <= end_date)]
    return result_df

get_era_batch(eras, convert_to_tf=False, aemlp_batch=False, features=None, targets=None, *args, **kwargs)

Get feature target pair batch of 1 or multiple eras.

:param eras: Selection of era names that should be present in era_col.

:param convert_to_tf: Convert to tf.Tensor.

:param aemlp_batch: Specific target batch for autoencoder training.

y output will contain three components: features, targets and targets.

:param features: List of features to select. All by default

:param targets: List of targets to select. All by default.

*args, **kwargs are passed to initialization of Tensor.

Source code in numerblox/numerframe.py
def get_era_batch(self, eras: List[Any],
                  convert_to_tf = False,
                  aemlp_batch = False,
                  features: list = None,
                  targets: list = None,
                  *args, **kwargs) -> Tuple["NumerFrame", "NumerFrame"]:
    """
    Get feature target pair batch of 1 or multiple eras. \n
    :param eras: Selection of era names that should be present in era_col. \n
    :param convert_to_tf: Convert to tf.Tensor. \n
    :param aemlp_batch: Specific target batch for autoencoder training. \n
    `y` output will contain three components: features, targets and targets. \n
    :param features: List of features to select. All by default \n
    :param targets: List of targets to select. All by default. \n
    *args, **kwargs are passed to initialization of Tensor.
    """
    valid_eras = []
    for era in eras:
        assert era in self[self.meta.era_col].unique(), f"Era '{era}' not found in era column ({self.meta.era_col})"
        valid_eras.append(era)
    features = features if features else self.feature_cols
    targets = targets if targets else self.target_cols
    X = self.loc[self[self.meta.era_col].isin(valid_eras)][features].values
    y = self.loc[self[self.meta.era_col].isin(valid_eras)][targets].values
    if aemlp_batch:
        y = [X.copy(), y.copy(), y.copy()]

    if convert_to_tf:
        try:
            import tensorflow as tf
        except ImportError:
            raise ImportError("TensorFlow is not installed. Please make sure to have Tensorflow installed when setting `convert_to_tf=True`.")
        X = tf.convert_to_tensor(X, *args, **kwargs)
        if aemlp_batch:
            y = [tf.convert_to_tensor(i, *args, **kwargs) for i in y]
        else:
            y = tf.convert_to_tensor(y, *args, **kwargs)
    return X, y
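
A minimal sketch of pulling a batch, assuming df is the NumerFrame from the earlier sketch and that the era names match values in its era column.

X, y = df.get_era_batch(eras=["0001", "0002"])                    # NumPy arrays
X_tf, y_tf = df.get_era_batch(eras=["0001"], convert_to_tf=True)  # tf.Tensor (requires TensorFlow)
X_ae, y_ae = df.get_era_batch(eras=["0001"], aemlp_batch=True)    # y -> [features, targets, targets]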

get_era_from_date(date_object) staticmethod

Get the era number from a specific date.

:param date_object: Pandas Timestamp object for which to get era.

:return: Era number.

Source code in numerblox/numerframe.py
@staticmethod
def get_era_from_date(date_object: pd.Timestamp) -> int:
    """ 
    Get the era number from a specific date. 
    :param date_object: Pandas Timestamp object for which to get era.
    :return: Era number.
    """
    assert isinstance(date_object, pd.Timestamp), f"date_object should be of type 'date' but is '{type(date_object)}'"
    current_date = pd.Timestamp(get_current_date())
    assert ERA1_TIMESTAMP <= date_object <= current_date, \
        f"date_object should be between {ERA_ONE_START} and {current_date}"
    return get_era_for_date(date_object.date())

get_era_range(start_era, end_era)

Get all eras between two era numbers.

:param start_era: Era number to start from (inclusive).

:param end_era: Era number to end with (inclusive).

:return: NumerFrame with all eras between start_era and end_era.

Source code in numerblox/numerframe.py
def get_era_range(self, start_era: int, end_era: int) -> "NumerFrame":
    """ 
    Get all eras between two era numbers. 
    :param start_era: Era number to start from (inclusive).
    :param end_era: Era number to end with (inclusive).
    :return: NumerFrame with all eras between start_era and end_era.
    """
    assert "era" in self.columns, "No 'era' column in NumerFrame. Please make sure to have an 'era' column."
    assert isinstance(start_era, int), f"start_era should be of type 'int' but is '{type(start_era)}'"
    assert isinstance(end_era, int), f"end_era should be of type 'int' but is '{type(end_era)}'"
    assert 1 <= start_era <= end_era <= get_current_era(), \
        f"start_era should be between 1 and {get_current_era()}. Got '{start_era}'."
    assert 1 <= start_era <= end_era <= get_current_era(), \
        f"end_era should be between 1 and {get_current_era()}. Got '{end_era}'."
    assert start_era <= end_era, f"start_era should be before end_era. Got '{start_era}' and '{end_era}'"

    temp_df = self.copy()
    temp_df['era_int'] = temp_df['era'].astype(int)
    result_df = temp_df[(temp_df['era_int'] >= start_era) & (temp_df['era_int'] <= end_era)]
    return result_df.drop(columns=['era_int'])

get_feature_group(group)

Get feature group based on name or list of names.

Source code in numerblox/numerframe.py
def get_feature_group(self, group: str) -> "NumerFrame":
    """ Get feature group based on name or list of names. """
    assert group in V4_2_FEATURE_GROUP_MAPPING.keys(), \
        f"Group '{group}' not found in {V4_2_FEATURE_GROUP_MAPPING.keys()}"
    return self.get_column_selection(cols=V4_2_FEATURE_GROUP_MAPPING[group])

get_feature_target_pair(multi_target=False)

Get split of feature and target columns.

:param multi_target: Returns only 'target' column by default. Returns all target columns when set to True.

Source code in numerblox/numerframe.py
def get_feature_target_pair(self, multi_target=False) -> Tuple["NumerFrame", "NumerFrame"]:
    """
    Get split of feature and target columns.
    :param multi_target: Returns only 'target' column by default.
    Returns all target columns when set to True.
    """
    X = self.get_feature_data
    y = self.get_target_data if multi_target else self.get_single_target_data
    return X, y
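
A short sketch, assuming df is a NumerFrame with feature and target columns.

X, y = df.get_feature_target_pair()                        # y is the single 'target' column
X, y_multi = df.get_feature_target_pair(multi_target=True) # y_multi holds all target columns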

get_last_n_eras(n)

Get data for the last n eras. Make sure eras are sorted in the way you prefer.

:param n: Number of eras to select.

:return: NumerFrame with last n eras.

Source code in numerblox/numerframe.py
def get_last_n_eras(self, n: int) -> "NumerFrame":
    """ 
    Get data for the last n eras. 
    Make sure eras are sorted in the way you prefer.
    :param n: Number of eras to select.
    :return: NumerFrame with last n eras.
    """
    eras = self[self.meta.era_col].unique()[-n:]
    return self.loc[self[self.meta.era_col].isin(eras)]

get_pattern_data(pattern)

Get columns based on pattern (for example '_20' to get all 20-day Numerai targets).

:param pattern: A 'like' pattern (pattern in column_name == True)

Source code in numerblox/numerframe.py
def get_pattern_data(self, pattern: str) -> "NumerFrame":
    """
    Get columns based on pattern (for example '_20' to get all 20-day Numerai targets).
    :param pattern: A 'like' pattern (pattern in column_name == True)
    """
    return self.filter(like=pattern)

create_numerframe(file_path, columns=None, *args, **kwargs)

Convenient function to initialize NumerFrame. Supports the most used file formats for Pandas DataFrames

(.csv, .parquet, .xls, .pkl, etc.). For more details check https://pandas.pydata.org/docs/reference/io.html

:param file_path: Relative or absolute path to data file.

:param columns: Which columns to read (All by default).

*args, **kwargs will be passed to Pandas loading function.

Source code in numerblox/numerframe.py
def create_numerframe(file_path: str, columns: list = None, *args, **kwargs) -> NumerFrame:
    """
    Convenient function to initialize NumerFrame.
    Support most used file formats for Pandas DataFrames \n
    (.csv, .parquet, .xls, .pkl, etc.).
    For more details check https://pandas.pydata.org/docs/reference/io.html

    :param file_path: Relative or absolute path to data file. \n
    :param columns: Which columns to read (All by default). \n
    *args, **kwargs will be passed to Pandas loading function.
    """
    assert Path(file_path).is_file(), f"{file_path} does not point to file."
    suffix = Path(file_path).suffix
    if suffix in [".csv"]:
        df = pd.read_csv(file_path, usecols=columns, *args, **kwargs)
    elif suffix in [".parquet"]:
        df = pd.read_parquet(file_path, columns=columns, *args, **kwargs)
    elif suffix in [".xls", ".xlsx", ".xlsm", "xlsb", ".odf", ".ods", ".odt"]:
        df = pd.read_excel(file_path, usecols=columns, *args, **kwargs)
    elif suffix in ['.pkl', '.pickle']:
        df = pd.read_pickle(file_path, *args, **kwargs)
        df = df.loc[:, columns] if columns else df
    else:
        raise NotImplementedError(f"Suffix '{suffix}' is not supported.")
    num_frame = NumerFrame(df)
    return num_frame
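
A minimal sketch; the path and column names below are placeholders for your own data.

from numerblox.numerframe import create_numerframe

# Reads the file with the appropriate Pandas reader and wraps it in a NumerFrame.
df = create_numerframe("data/train.parquet", columns=["era", "feature_example", "target"])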

BasePreProcessor

Bases: BaseEstimator, TransformerMixin

Common functionality for preprocessors and postprocessors.

Source code in numerblox/preprocessing/base.py
class BasePreProcessor(BaseEstimator, TransformerMixin):
    """Common functionality for preprocessors and postprocessors."""

    def __init__(self):
        sklearn.set_config(enable_metadata_routing=True)

    def fit(self, X, y=None):
        self.is_fitted_ = True
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], y=None, **kwargs
    ) -> pd.DataFrame:
        ...

    @abstractmethod
    def get_feature_names_out(self, input_features=None) -> List[str]:
        ...
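
To add a custom step, subclass BasePreProcessor and implement the two abstract methods. A minimal sketch: the class name and logic are illustrative and not part of NumerBlox, and the import path is inferred from the source file shown above.

import pandas as pd
from numerblox.preprocessing.base import BasePreProcessor

class MeanFeaturePreProcessor(BasePreProcessor):
    """Hypothetical example: outputs the row-wise mean of all input features."""

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return X.mean(axis=1).to_frame(name="feature_mean")

    def get_feature_names_out(self, input_features=None):
        return ["feature_mean"]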

GroupStatsPreProcessor

Bases: BasePreProcessor

WARNING: Only supported for v4.2 (Rain) data. The Rain dataset (re)introduced feature groups.

Note that this class only works with pd.DataFrame input. When using it in a Pipeline, make sure that the Pandas output API is set (.set_output(transform="pandas")).

Calculates group statistics for all data groups.

:param groups: Groups to create features for. All groups by default.

Source code in numerblox/preprocessing/classic.py
class GroupStatsPreProcessor(BasePreProcessor):
    """
    WARNING: Only supported for v4.2 (Rain) data. The Rain dataset (re)introduced feature groups. \n
    Note that this class only works with `pd.DataFrame` input.
    When using it in a Pipeline, make sure that the Pandas output API is set (`.set_output(transform="pandas")`).

    Calculates group statistics for all data groups. \n
    :param groups: Groups to create features for. All groups by default. \n
    """
    def __init__(self, groups: list = None):
        super().__init__()
        self.all_groups = [
            'intelligence', 
            'charisma', 
            'strength', 
            'dexterity', 
            'constitution', 
            'wisdom', 
            'agility', 
            'serenity', 
            'sunshine', 
            'rain'
        ]
        self.groups = groups 
        self.group_names = groups if self.groups else self.all_groups
        self.feature_group_mapping = V4_2_FEATURE_GROUP_MAPPING

    def transform(self, X: pd.DataFrame) -> np.array:
        """Check validity and add group features."""
        dataf = self._add_group_features(X)
        return dataf.to_numpy()

    def _add_group_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """Mean, standard deviation and skew for each group."""
        dataf = pd.DataFrame()
        for group in self.group_names:
            cols = self.feature_group_mapping[group]
            valid_cols = [col for col in cols if col in X.columns]
            if not valid_cols:
                warnings.warn(f"None of the columns of '{group}' are in the input data. Output will be nans for the group features.")
            elif len(cols) != len(valid_cols):
                warnings.warn(f"Not all columns of '{group}' are in the input data ({len(valid_cols)} < {len(cols)}). Use remaining columns for stats features.")
            dataf.loc[:, f"feature_{group}_mean"] = X[valid_cols].mean(axis=1)
            dataf.loc[:, f"feature_{group}_std"] = X[valid_cols].std(axis=1)
            dataf.loc[:, f"feature_{group}_skew"] = X[valid_cols].skew(axis=1)
        return dataf

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        if not input_features:
            feature_names = []
            for group in self.group_names:
                feature_names.append(f"feature_{group}_mean")
                feature_names.append(f"feature_{group}_std")
                feature_names.append(f"feature_{group}_skew")
        else:
            feature_names = input_features
        return feature_names
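
A minimal usage sketch, assuming X is a pd.DataFrame with v4.2 feature columns; the import path follows the source file shown above.

from numerblox.preprocessing.classic import GroupStatsPreProcessor

gsp = GroupStatsPreProcessor(groups=["sunshine", "rain"])
group_stats = gsp.fit_transform(X)      # NumPy array with mean, std and skew per group
print(gsp.get_feature_names_out())      # e.g. ['feature_sunshine_mean', 'feature_sunshine_std', ...]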

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/classic.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    if not input_features:
        feature_names = []
        for group in self.group_names:
            feature_names.append(f"feature_{group}_mean")
            feature_names.append(f"feature_{group}_std")
            feature_names.append(f"feature_{group}_skew")
    else:
        feature_names = input_features
    return feature_names

transform(X)

Check validity and add group features.

Source code in numerblox/preprocessing/classic.py
def transform(self, X: pd.DataFrame) -> np.array:
    """Check validity and add group features."""
    dataf = self._add_group_features(X)
    return dataf.to_numpy()

DifferencePreProcessor

Bases: BasePreProcessor

Add difference features based on given windows. Run LagPreProcessor first. Usage in Pipeline works only with Pandas API. Run .set_output("pandas") on your pipeline first.

:param windows: All lag windows to process for all features. [5, 10, 15, 20] by default.

:param pct_diff: Method to calculate differences. If True, will calculate differences with a percentage change. Otherwise calculates a simple difference. Defaults to False.

:param abs_diff: Whether to also calculate the absolute value of all differences. Defaults to False.

Source code in numerblox/preprocessing/signals.py
class DifferencePreProcessor(BasePreProcessor):
    """
    Add difference features based on given windows. Run LagPreProcessor first.
    Usage in Pipeline works only with Pandas API. 
    Run `.set_output("pandas")` on your pipeline first.

    :param windows: All lag windows to process for all features. \n
    :param feature_names: All features for which you want to create differences. All features that also have lags by default. \n
    :param pct_change: Method to calculate differences. If True, will calculate differences with a percentage change. Otherwise calculates a simple difference. Defaults to False \n
    :param abs_diff: Whether to also calculate the absolute value of all differences. Defaults to True \n
    """

    def __init__(
        self,
        windows: list = None,
        pct_diff: bool = False,
        abs_diff: bool = False,
    ):
        super().__init__()
        self.windows = windows if windows else [5, 10, 15, 20]
        self.pct_diff = pct_diff
        self.abs_diff = abs_diff

    def transform(self, X: pd.DataFrame) -> np.array:
        """
        Create difference feature from lag features.
        :param X: DataFrame with lag features.
        NOTE: Make sure only lag features are present in the DataFrame.
        """
        feature_names = X.columns.tolist()
        for col in feature_names:
            assert "_lag" in col, "DifferencePreProcessor expects only lag features. Got feature: '{col}'"
        output_features = []
        for feature in tqdm(feature_names, desc="Difference feature generation"):
            for day in self.windows:
                differenced_values = (
                        (X[feature] / X[feature]) - 1
                        if self.pct_diff
                        else X[feature] - X[feature]
                    )
                X.loc[:, f"{feature}_diff{day}"] = differenced_values
                output_features.append(f"{feature}_diff{day}")
                if self.abs_diff:
                    X.loc[:, f"{feature}_absdiff{day}"] = np.abs(
                            X[f"{feature}_diff{day}"]
                        )
                    output_features.append(f"{feature}_absdiff{day}")
        self.output_features = output_features
        return X[self.output_features].to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        return self.output_features if not input_features else input_features
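
A minimal sketch of the intended chaining. LagPreProcessor is documented further below; price_df is a hypothetical DataFrame with 'close' and 'ticker' columns.

import pandas as pd

lags = LagPreProcessor(windows=[5, 10])
lag_array = lags.fit_transform(price_df[["close"]], ticker_series=price_df["ticker"])
lag_df = pd.DataFrame(lag_array, columns=lags.get_feature_names_out(), index=price_df.index)

# DifferencePreProcessor expects a DataFrame that contains only lag features.
diffs = DifferencePreProcessor(windows=[5, 10], abs_diff=True)
diff_array = diffs.fit_transform(lag_df)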

transform(X)

Create difference feature from lag features.

:param X: DataFrame with lag features. NOTE: Make sure only lag features are present in the DataFrame.

Source code in numerblox/preprocessing/signals.py
def transform(self, X: pd.DataFrame) -> np.array:
    """
    Create difference feature from lag features.
    :param X: DataFrame with lag features.
    NOTE: Make sure only lag features are present in the DataFrame.
    """
    feature_names = X.columns.tolist()
    for col in feature_names:
        assert "_lag" in col, "DifferencePreProcessor expects only lag features. Got feature: '{col}'"
    output_features = []
    for feature in tqdm(feature_names, desc="Difference feature generation"):
        for day in self.windows:
            differenced_values = (
                    (X[feature] / X[feature]) - 1
                    if self.pct_diff
                    else X[feature] - X[feature]
                )
            X.loc[:, f"{feature}_diff{day}"] = differenced_values
            output_features.append(f"{feature}_diff{day}")
            if self.abs_diff:
                X.loc[:, f"{feature}_absdiff{day}"] = np.abs(
                        X[f"{feature}_diff{day}"]
                    )
                output_features.append(f"{feature}_absdiff{day}")
    self.output_features = output_features
    return X[self.output_features].to_numpy()

EraQuantileProcessor

Bases: BasePreProcessor

Transform features into quantiles by era.

:param num_quantiles: Number of quantiles to use for quantile transformation.

:param random_state: Random state for QuantileTransformer.

:param cpu_cores: Number of CPU cores to use for parallel processing.

Source code in numerblox/preprocessing/signals.py
class EraQuantileProcessor(BasePreProcessor):
    """
    Transform features into quantiles by era.
    :param num_quantiles: Number of quantiles to use for quantile transformation. 
    :param random_state: Random state for QuantileTransformer. 
    :param cpu_cores: Number of CPU cores to use for parallel processing.  
    """
    def __init__(
        self,
        num_quantiles: int = 50,
        random_state: int = 0,
        cpu_cores: int = -1,
    ):
        super().__init__()
        self.num_quantiles = num_quantiles
        self.random_state = random_state
        self.cpu_cores = cpu_cores
        self.quantiler = QuantileTransformer(
            n_quantiles=self.num_quantiles, random_state=self.random_state
        )
        # Metadata routing
        self.set_transform_request(era_series=True)

    def _quantile_transform(self, group_data: pd.Series) -> pd.Series:
        """
        Process single feature for a single era.
        :param group_data: Data for a single feature and era.
        :return: Quantile transformed data.
        """
        transformed_data = self.quantiler.fit_transform(group_data.to_frame()).ravel()
        return pd.Series(transformed_data, index=group_data.index)

    def transform(
        self, X: Union[np.array, pd.DataFrame],
        era_series: pd.Series = None,
    ) -> np.array:
        """ 
        Quantile all features by era.
        :param X: Array or DataFrame containing features to be quantiled.
        :param era_series: Series containing era information.
        :return: Quantiled features.
        """
        X = pd.DataFrame(X)
        if era_series is None:
            warnings.warn("WARNING: 'era_series' not provided for EraQuantileProcessor! Quantiling will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to EraQuantileProcessor in this way! Not providing 'era_series' is valid for live inference, where only one era is used for quantiling.")
        else:
            assert X.shape[0] == era_series.shape[0], "Input X and era_series must have the same number of rows for quantiling."
        self.features = [col for col in X.columns]
        X.loc[:, "era"] = era_series if era_series is not None else "X"
        date_groups = X.groupby('era', group_keys=False)

        def process_feature(feature):
            group_data = date_groups[feature].apply(lambda x: self._quantile_transform(x))
            return pd.Series(group_data, name=f"{feature}_quantile{self.num_quantiles}")

        output_series_list = Parallel(n_jobs=self.cpu_cores)(
            delayed(process_feature)(feature) for feature in tqdm(self.features, desc=f"Quantiling {len(self.features)} features")
        )
        output_df = pd.concat(output_series_list, axis=1)
        return output_df.to_numpy()

    def fit_transform(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series):
        self.fit(X=X)
        return self.transform(X=X, era_series=era_series)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        if not input_features:
            feature_names = []
            for feature in self.features:
                feature_names.append(f"{feature}_quantile{self.num_quantiles}")
        else:
            feature_names = input_features
        return feature_names
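
A minimal sketch, assuming X is a DataFrame of signal features and era_series is a Series of equal length with era (or date) values.

eqp = EraQuantileProcessor(num_quantiles=50, cpu_cores=4)
quantiled = eqp.fit_transform(X, era_series=era_series)   # NumPy array of quantiled features
quantile_cols = eqp.get_feature_names_out()               # e.g. ['feature_xyz_quantile50', ...]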

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/signals.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    if not input_features:
        feature_names = []
        for feature in self.features:
            feature_names.append(f"{feature}_quantile{self.num_quantiles}")
    else:
        feature_names = input_features
    return feature_names

transform(X, era_series=None)

Quantile all features by era.

:param X: Array or DataFrame containing features to be quantiled.

:param era_series: Series containing era information.

:return: Quantiled features.

Source code in numerblox/preprocessing/signals.py
def transform(
    self, X: Union[np.array, pd.DataFrame],
    era_series: pd.Series = None,
) -> np.array:
    """ 
    Quantile all features by era.
    :param X: Array or DataFrame containing features to be quantiled.
    :param era_series: Series containing era information.
    :return: Quantiled features.
    """
    X = pd.DataFrame(X)
    if era_series is None:
        warnings.warn("WARNING: 'era_series' not provided for EraQuantileProcessor! Quantiling will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to EraQuantileProcessor in this way! Not providing 'era_series' is valid for live inference, where only one era is used for quantiling.")
    else:
        assert X.shape[0] == era_series.shape[0], "Input X and era_series must have the same number of rows for quantiling."
    self.features = [col for col in X.columns]
    X.loc[:, "era"] = era_series if era_series is not None else "X"
    date_groups = X.groupby('era', group_keys=False)

    def process_feature(feature):
        group_data = date_groups[feature].apply(lambda x: self._quantile_transform(x))
        return pd.Series(group_data, name=f"{feature}_quantile{self.num_quantiles}")

    output_series_list = Parallel(n_jobs=self.cpu_cores)(
        delayed(process_feature)(feature) for feature in tqdm(self.features, desc=f"Quantiling {len(self.features)} features")
    )
    output_df = pd.concat(output_series_list, axis=1)
    return output_df.to_numpy()

HLOCVAdjuster

Bases: BasePreProcessor

Adjust HLOCV data for splits and dividends based on ratio of unadjusted and adjusted close prices. NOTE: This step only works with DataFrame input. Usage in intermediate steps of a scikit-learn Pipeline works with the Pandas set_output API. i.e. pipeline.set_output(transform="pandas").

Source code in numerblox/preprocessing/signals.py
class HLOCVAdjuster(BasePreProcessor):
    """ 
    Adjust HLOCV data for splits and dividends based on ratio of unadjusted and adjusted close prices.
    NOTE: This step only works with DataFrame input. 
    Usage in intermediate steps of a scikit-learn Pipeline works with the Pandas set_output API.
    i.e. pipeline.set_output(transform="pandas").
    """
    def __init__(self, open_col="open", high_col="high", low_col="low", 
                 close_col="close", volume_col="volume", adj_close_col="adjusted_close"):
        super().__init__()
        self.open_col = open_col
        self.high_col = high_col
        self.low_col = low_col
        self.close_col = close_col
        self.volume_col = volume_col
        self.adj_close_col = adj_close_col
        self.adjusted_col_names = [f"adjusted_{self.high_col}", f"adjusted_{self.low_col}",
                                   f"adjusted_{self.open_col}", self.adj_close_col, 
                                   f"adjusted_{self.volume_col}"]

    def fit(self, X: pd.DataFrame, y=None):
        self.ratio_ = X[self.close_col] / X[self.adj_close_col]
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame) -> np.array:
        """
        Adjust open, high, low, close and volume for splits and dividends.
        :param X: DataFrame with columns: [high, low, open, close, volume] (HLOCV)
        :return: Array with adjusted HLOCV columns
        """
        X_copy = X.copy()  
        X_copy[f"adjusted_{self.high_col}"] = X[self.high_col] / self.ratio_
        X_copy[f"adjusted_{self.low_col}"] = X[self.low_col] / self.ratio_
        X_copy[f"adjusted_{self.open_col}"] = X[self.open_col] / self.ratio_
        X_copy[f"adjusted_{self.volume_col}"] = X[self.volume_col] * self.ratio_
        return X_copy[self.adjusted_col_names].to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        return self.adjusted_col_names
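
A minimal sketch, assuming ohlcv_df is a hypothetical DataFrame with 'open', 'high', 'low', 'close', 'volume' and 'adjusted_close' columns.

adjuster = HLOCVAdjuster()
adjusted = adjuster.fit_transform(ohlcv_df)      # NumPy array, ordered as in adjusted_col_names
adjusted_cols = adjuster.get_feature_names_out()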

transform(X)

Adjust open, high, low, close and volume for splits and dividends.

:param X: DataFrame with columns: [high, low, open, close, volume] (HLOCV)

:return: Array with adjusted HLOCV columns

Source code in numerblox/preprocessing/signals.py
def transform(self, X: pd.DataFrame) -> np.array:
    """
    Adjust open, high, low, close and volume for splits and dividends.
    :param X: DataFrame with columns: [high, low, open, close, volume] (HLOCV)
    :return: Array with adjusted HLOCV columns
    """
    X_copy = X.copy()  
    X_copy[f"adjusted_{self.high_col}"] = X[self.high_col] / self.ratio_
    X_copy[f"adjusted_{self.low_col}"] = X[self.low_col] / self.ratio_
    X_copy[f"adjusted_{self.open_col}"] = X[self.open_col] / self.ratio_
    X_copy[f"adjusted_{self.volume_col}"] = X[self.volume_col] * self.ratio_
    return X_copy[self.adjusted_col_names].to_numpy()

KatsuFeatureGenerator

Bases: BasePreProcessor

Effective feature engineering setup based on Katsu's starter notebook. Based on source by Katsu1110: https://www.kaggle.com/code1110/numeraisignals-starter-for-beginners

:param windows: Time interval to apply for window features:

  1. Percentage Rate of change

  2. Volatility

  3. Moving Average gap

:param ticker_col: Columns with tickers to iterate over.

:param close_col: Column name where you have closing price stored.

:param num_cores: Number of cores to use for multiprocessing.

:param verbose: Print additional information.

Source code in numerblox/preprocessing/signals.py
class KatsuFeatureGenerator(BasePreProcessor):
    """
    Effective feature engineering setup based on Katsu's starter notebook.
    Based on source by Katsu1110: https://www.kaggle.com/code1110/numeraisignals-starter-for-beginners

    :param windows: Time interval to apply for window features: \n
    1. Percentage Rate of change \n
    2. Volatility \n
    3. Moving Average gap \n
    :param ticker_col: Columns with tickers to iterate over. \n
    :param close_col: Column name where you have closing price stored.
    :param num_cores: Number of cores to use for multiprocessing. \n
    :param verbose: Print additional information.
    """

    warnings.filterwarnings("ignore")
    def __init__(
        self,
        windows: list,
        ticker_col: str = "ticker",
        close_col: str = "close",
        num_cores: int = None,
        verbose=True
    ):
        super().__init__()
        self.windows = windows
        self.ticker_col = ticker_col
        self.close_col = close_col
        self.num_cores = num_cores if num_cores else os.cpu_count()
        self.verbose = verbose

    def transform(self, dataf: pd.DataFrame) -> np.array:
        """
        Multiprocessing feature engineering.

        :param dataf: DataFrame with columns: [ticker, date, open, high, low, close, volume] \n
        """
        tickers = dataf.loc[:, self.ticker_col].unique().tolist()
        if self.verbose:
            print(
                f"Feature engineering for {len(tickers)} tickers using {self.num_cores} CPU cores."
            )
        dataf_list = [
            x
            for _, x in tqdm(
                dataf.groupby(self.ticker_col), desc="Generating ticker DataFrames"
            )
        ]
        dataf = self._generate_features(dataf_list=dataf_list)
        output_cols = self.get_feature_names_out()
        return dataf[output_cols].to_numpy()

    def feature_engineering(self, dataf: pd.DataFrame) -> pd.DataFrame:
        """Feature engineering for single ticker."""
        close_series = dataf.loc[:, self.close_col]
        for x in self.windows:
            dataf.loc[
                :, f"feature_{self.close_col}_ROCP_{x}"
            ] = close_series.pct_change(x)

            dataf.loc[:, f"feature_{self.close_col}_VOL_{x}"] = (
                np.log1p(close_series).pct_change().rolling(x).std()
            )

            dataf.loc[:, f"feature_{self.close_col}_MA_gap_{x}"] = (
                close_series / close_series.rolling(x).mean()
            )

        dataf.loc[:, "feature_RSI"] = self._rsi(close_series)
        macd, macd_signal = self._macd(close_series)
        dataf.loc[:, "feature_MACD"] = macd
        dataf.loc[:, "feature_MACD_signal"] = macd_signal
        return dataf

    def _generate_features(self, dataf_list: list) -> pd.DataFrame:
        """Add features for list of ticker DataFrames and concatenate."""
        with Pool(self.num_cores) as p:
            feature_datafs = list(
                tqdm(
                    p.imap(self.feature_engineering, dataf_list),
                    desc="Generating features",
                    total=len(dataf_list),
                )
            )
        return pd.concat(feature_datafs)

    @staticmethod
    def _rsi(close: pd.Series, period: int = 14) -> pd.Series:
        """
        See source https://github.com/peerchemist/finta
        and fix https://www.tradingview.com/wiki/Talk:Relative_Strength_Index_(RSI)
        """
        delta = close.diff()
        up, down = delta.copy(), delta.copy()
        up[up < 0] = 0
        down[down > 0] = 0

        gain = up.ewm(com=(period - 1), min_periods=period).mean()
        loss = down.abs().ewm(com=(period - 1), min_periods=period).mean()

        rs = gain / loss
        return pd.Series(100 - (100 / (1 + rs)))

    def _macd(
        self, close: pd.Series, span1=12, span2=26, span3=9
    ) -> Tuple[pd.Series, pd.Series]:
        """Compute MACD and MACD signal."""
        exp1 = self.__ema1(close, span1)
        exp2 = self.__ema1(close, span2)
        macd = 100 * (exp1 - exp2) / exp2
        signal = self.__ema1(macd, span3)
        return macd, signal

    @staticmethod
    def __ema1(series: pd.Series, span: int) -> pd.Series:
        """Exponential moving average"""
        a = 2 / (span + 1)
        return series.ewm(alpha=a).mean()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        if not input_features:
            feature_names = []
            for x in self.windows:
                feature_names += [
                    f"feature_{self.close_col}_ROCP_{x}",
                    f"feature_{self.close_col}_VOL_{x}",
                    f"feature_{self.close_col}_MA_gap_{x}",
                ]
            feature_names += [
                "feature_RSI",
                "feature_MACD",
                "feature_MACD_signal",
            ]
        else:
            feature_names = input_features
        return feature_names
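
A minimal sketch, assuming price_df is a hypothetical DataFrame with at least 'ticker' and 'close' columns, sorted by date within each ticker.

kfg = KatsuFeatureGenerator(windows=[20, 40, 60], num_cores=4)
# Uses multiprocessing internally, so run inside an "if __name__ == '__main__':" guard in scripts.
katsu_features = kfg.fit_transform(price_df)    # NumPy array
feature_names = kfg.get_feature_names_out()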

__ema1(series, span) staticmethod

Exponential moving average

Source code in numerblox/preprocessing/signals.py
@staticmethod
def __ema1(series: pd.Series, span: int) -> pd.Series:
    """Exponential moving average"""
    a = 2 / (span + 1)
    return series.ewm(alpha=a).mean()

feature_engineering(dataf)

Feature engineering for single ticker.

Source code in numerblox/preprocessing/signals.py
def feature_engineering(self, dataf: pd.DataFrame) -> pd.DataFrame:
    """Feature engineering for single ticker."""
    close_series = dataf.loc[:, self.close_col]
    for x in self.windows:
        dataf.loc[
            :, f"feature_{self.close_col}_ROCP_{x}"
        ] = close_series.pct_change(x)

        dataf.loc[:, f"feature_{self.close_col}_VOL_{x}"] = (
            np.log1p(close_series).pct_change().rolling(x).std()
        )

        dataf.loc[:, f"feature_{self.close_col}_MA_gap_{x}"] = (
            close_series / close_series.rolling(x).mean()
        )

    dataf.loc[:, "feature_RSI"] = self._rsi(close_series)
    macd, macd_signal = self._macd(close_series)
    dataf.loc[:, "feature_MACD"] = macd
    dataf.loc[:, "feature_MACD_signal"] = macd_signal
    return dataf

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/signals.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    if not input_features:
        feature_names = []
        for x in self.windows:
            feature_names += [
                f"feature_{self.close_col}_ROCP_{x}",
                f"feature_{self.close_col}_VOL_{x}",
                f"feature_{self.close_col}_MA_gap_{x}",
            ]
        feature_names += [
            "feature_RSI",
            "feature_MACD",
            "feature_MACD_signal",
        ]
    else:
        feature_names = input_features
    return feature_names

transform(dataf)

Multiprocessing feature engineering.

:param dataf: DataFrame with columns: [ticker, date, open, high, low, close, volume]

Source code in numerblox/preprocessing/signals.py
def transform(self, dataf: pd.DataFrame) -> np.array:
    """
    Multiprocessing feature engineering.

    :param dataf: DataFrame with columns: [ticker, date, open, high, low, close, volume] \n
    """
    tickers = dataf.loc[:, self.ticker_col].unique().tolist()
    if self.verbose:
        print(
            f"Feature engineering for {len(tickers)} tickers using {self.num_cores} CPU cores."
        )
    dataf_list = [
        x
        for _, x in tqdm(
            dataf.groupby(self.ticker_col), desc="Generating ticker DataFrames"
        )
    ]
    dataf = self._generate_features(dataf_list=dataf_list)
    output_cols = self.get_feature_names_out()
    return dataf[output_cols].to_numpy()

LagPreProcessor

Bases: BasePreProcessor

Add lag features based on given windows.

:param windows: All lag windows to process for all features.

[5, 10, 15, 20] by default (4 weeks lookback)

Source code in numerblox/preprocessing/signals.py
class LagPreProcessor(BasePreProcessor):
    """
    Add lag features based on given windows.

    :param windows: All lag windows to process for all features. \n
    [5, 10, 15, 20] by default (4 weeks lookback) \n
    """

    def __init__(self, windows: list = None,):
        super().__init__()
        self.windows = windows if windows else [5, 10, 15, 20]
        # Metadata routing
        self.set_transform_request(ticker_series=True)

    def transform(self, X: Union[np.array, pd.DataFrame], ticker_series: pd.Series = None) -> np.array:
        if ticker_series is None:
            warnings.warn("WARNING: 'era_series' not provided for LagPreProcessor! Lags will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to LagPreProcessor in this way! Not providing 'era_series' is valid for live inference, where only one era is used for creating lags.")
        else:
            assert X.shape[0] == ticker_series.shape[0], "Input X and ticker_series must have the same number of rows for lag generation."

        X = pd.DataFrame(X)
        feature_cols = X.columns.tolist()
        X["ticker"] = ticker_series if ticker_series is not None else "XXXXXXXXXXXXXXXXXXXXXX"
        ticker_groups = X.groupby("ticker")
        output_features = []
        for feature in tqdm(feature_cols, desc="Lag feature generation"):
            feature_group = ticker_groups[feature]
            for day in self.windows:
                shifted = feature_group.shift(day)
                X.loc[:, f"{feature}_lag{day}"] = shifted
                output_features.append(f"{feature}_lag{day}")
        self.output_features = output_features
        return X[output_features].to_numpy()

    def fit_transform(self, X: Union[np.array, pd.DataFrame], ticker_series: pd.Series):
        self.fit(X=X)
        return self.transform(X=X, ticker_series=ticker_series)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        return self.output_features if not input_features else input_features
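
A minimal sketch outside a Pipeline, with a hypothetical price_df holding 'close', 'volume' and 'ticker' columns.

lags = LagPreProcessor(windows=[5, 10, 15, 20])
lag_array = lags.fit_transform(price_df[["close", "volume"]], ticker_series=price_df["ticker"])
lag_cols = lags.get_feature_names_out()   # e.g. ['close_lag5', ..., 'volume_lag20']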

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/signals.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    return self.output_features if not input_features else input_features

MinimumDataFilter

Bases: BasePreProcessor

Filter dates and tickers based on minimum data requirements. NOTE: This step only works with DataFrame input.

:param min_samples_date: Minimum number of samples per date. Defaults to 200.

:param min_samples_ticker: Minimum number of samples per ticker. Defaults to 1200.

:param blacklist_tickers: List of tickers to exclude from the dataset. Defaults to None.

:param date_col: Column name for date. Defaults to "date".

:param ticker_col: Column name for ticker. Defaults to "bloomberg_ticker".

Source code in numerblox/preprocessing/signals.py
class MinimumDataFilter(BasePreProcessor):
    """ 
    Filter dates and tickers based on minimum data requirements. 
    NOTE: This step only works with DataFrame input.

    :param min_samples_date: Minimum number of samples per date. Defaults to 200.
    :param min_samples_ticker: Minimum number of samples per ticker. Defaults to 1200.
    :param blacklist_tickers: List of tickers to exclude from the dataset. Defaults to None.
    :param date_col: Column name for date. Defaults to "date".
    :param ticker_col: Column name for ticker. Defaults to "bloomberg_ticker".
    """
    def __init__(self, min_samples_date: int = 200, min_samples_ticker: int = 1200, blacklist_tickers: list = None, date_col="date", ticker_col="bloomberg_ticker"):
        super().__init__()
        self.min_samples_date = min_samples_date
        self.min_samples_ticker = min_samples_ticker
        self.blacklist_tickers = blacklist_tickers
        self.date_col = date_col
        self.ticker_col = ticker_col

    def fit(self, X: pd.DataFrame, y=None):
        self.feature_names_out_ = X.columns.tolist()
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame) -> np.array:
        """
        Filter dates and tickers based on minimum data requirements.
        :param X: DataFrame with columns: [ticker_col, date_col, open, high, low, close, volume] (HLOCV)
        :return: Array with filtered DataFrame
        """
        filtered_data = X.groupby(self.date_col).filter(lambda x: len(x) >= self.min_samples_date)
        records_per_ticker = (
            filtered_data.reset_index(drop=False)
            .groupby(self.ticker_col)[self.date_col]
            .nunique()
            .reset_index()
            .sort_values(by=self.date_col)
        )
        tickers_with_records = records_per_ticker.query(f"{self.date_col} >= {self.min_samples_ticker}")[self.ticker_col].values
        filtered_data = filtered_data.loc[filtered_data[self.ticker_col].isin(tickers_with_records)].reset_index(drop=True)

        if self.blacklist_tickers:
            filtered_data = filtered_data.loc[~filtered_data[self.ticker_col].isin(self.blacklist_tickers)]

        return filtered_data.to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        check_is_fitted(self)
        return self.feature_names_out_ if not input_features else input_features
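
A minimal sketch, with a hypothetical raw_df that has 'date' and 'bloomberg_ticker' columns.

import pandas as pd

mdf = MinimumDataFilter(min_samples_date=200, min_samples_ticker=1200, blacklist_tickers=["XYZ US"])  # blacklist entry is illustrative
filtered_array = mdf.fit_transform(raw_df)
filtered_df = pd.DataFrame(filtered_array, columns=mdf.get_feature_names_out())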

transform(X)

Filter dates and tickers based on minimum data requirements.

:param X: DataFrame with columns: [ticker_col, date_col, open, high, low, close, volume] (HLOCV)

:return: Array with filtered DataFrame

Source code in numerblox/preprocessing/signals.py
def transform(self, X: pd.DataFrame) -> np.array:
    """
    Filter dates and tickers based on minimum data requirements.
    :param X: DataFrame with columns: [ticker_col, date_col, open, high, low, close, volume] (HLOCV)
    :return: Array with filtered DataFrame
    """
    filtered_data = X.groupby(self.date_col).filter(lambda x: len(x) >= self.min_samples_date)
    records_per_ticker = (
        filtered_data.reset_index(drop=False)
        .groupby(self.ticker_col)[self.date_col]
        .nunique()
        .reset_index()
        .sort_values(by=self.date_col)
    )
    tickers_with_records = records_per_ticker.query(f"{self.date_col} >= {self.min_samples_ticker}")[self.ticker_col].values
    filtered_data = filtered_data.loc[filtered_data[self.ticker_col].isin(tickers_with_records)].reset_index(drop=True)

    if self.blacklist_tickers:
        filtered_data = filtered_data.loc[~filtered_data[self.ticker_col].isin(self.blacklist_tickers)]

    return filtered_data.to_numpy()

PandasTaFeatureGenerator

Bases: BasePreProcessor

Generate features with pandas-ta. https://github.com/twopirllc/pandas-ta Usage in Pipeline works only with Pandas API. Run .set_output("pandas") on your pipeline first.

:param strategy: Valid Pandas Ta strategy.

For more information on creating a strategy, see:

https://github.com/twopirllc/pandas-ta#pandas-ta-strategy

By default, a strategy with RSI(14) and RSI(60) is used.

:param ticker_col: Column name for grouping by tickers.

:param num_cores: Number of cores to use for multiprocessing.

By default, all available cores are used.

Source code in numerblox/preprocessing/signals.py
class PandasTaFeatureGenerator(BasePreProcessor):
    """
    Generate features with pandas-ta.
    https://github.com/twopirllc/pandas-ta
    Usage in Pipeline works only with Pandas API. 
    Run `.set_output("pandas")` on your pipeline first.

    :param strategy: Valid Pandas Ta strategy. \n
    For more information on creating a strategy, see: \n
    https://github.com/twopirllc/pandas-ta#pandas-ta-strategy \n
    By default, a strategy with RSI(14) and RSI(60) is used. \n
    :param ticker_col: Column name for grouping by tickers. \n
    :param num_cores: Number of cores to use for multiprocessing. \n
    By default, all available cores are used. \n
    """
    def __init__(self, 
                 strategy: ta.Strategy = None,
                 ticker_col: str = "ticker",
                 num_cores: int = None,
    ):
        super().__init__()
        self.ticker_col = ticker_col
        self.num_cores = num_cores if num_cores else os.cpu_count()
        standard_strategy = ta.Strategy(name="standard", 
                                        ta=[{"kind": "rsi", "length": 14, "col_names": ("feature_RSI_14")},
                                            {"kind": "rsi", "length": 60, "col_names": ("feature_RSI_60")}])
        self.strategy = strategy if strategy is not None else standard_strategy

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Main feature generation method. \n 
        :param X: DataFrame with columns: [ticker, date, open, high, low, close, volume] \n
        :return: PandasTA features
        """
        initial_features = X.columns.tolist()
        dataf_list = [
            x
            for _, x in tqdm(
                X.groupby(self.ticker_col), desc="Generating ticker DataFrames"
            )
        ]
        X = self._generate_features(dataf_list=dataf_list)
        output_df = X.drop(columns=initial_features)
        self.output_cols = output_df.columns.tolist()
        return output_df

    def _generate_features(self, dataf_list: List[pd.DataFrame]) -> pd.DataFrame:
        """
        Add features for list of ticker DataFrames and concatenate.
        :param dataf_list: List of DataFrames for each ticker.
        :return: Concatenated DataFrame with features added for the full list of tickers.
        """
        with Pool(self.num_cores) as p:
            feature_datafs = list(
                tqdm(
                    p.imap(self.add_features, dataf_list),
                    desc="Generating pandas-ta features",
                    total=len(dataf_list),
                )
            )
        return pd.concat(feature_datafs)

    def add_features(self, ticker_df: pd.DataFrame) -> pd.DataFrame:
        """ 
        The TA strategy is applied to the DataFrame here.
        :param ticker_df: DataFrame for a single ticker.
        :return: DataFrame with features added.
        """
        # We use a different multiprocessing engine, so shut off pandas_ta's own multiprocessing
        ticker_df.ta.cores = 0
        # Run strategy
        ticker_df.ta.strategy(self.strategy)
        return ticker_df

    def get_feature_names_out(self, input_features=None) -> List[str]:
        return self.output_cols if not input_features else input_features

add_features(ticker_df)

The TA strategy is applied to the DataFrame here. :param ticker_df: DataFrame for a single ticker. :return: DataFrame with features added.

Source code in numerblox/preprocessing/signals.py
def add_features(self, ticker_df: pd.DataFrame) -> pd.DataFrame:
    """ 
    The TA strategy is applied to the DataFrame here.
    :param ticker_df: DataFrame for a single ticker.
    :return: DataFrame with features added.
    """
    # We use a different multiprocessing engine, so shut off pandas_ta's own multiprocessing
    ticker_df.ta.cores = 0
    # Run strategy
    ticker_df.ta.strategy(self.strategy)
    return ticker_df

transform(X)

Main feature generation method.

:param X: DataFrame with columns: [ticker, date, open, high, low, close, volume]

:return: PandasTA features

Source code in numerblox/preprocessing/signals.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Main feature generation method. \n 
    :param X: DataFrame with columns: [ticker, date, open, high, low, close, volume] \n
    :return: PandasTA features
    """
    initial_features = X.columns.tolist()
    dataf_list = [
        x
        for _, x in tqdm(
            X.groupby(self.ticker_col), desc="Generating ticker DataFrames"
        )
    ]
    X = self._generate_features(dataf_list=dataf_list)
    output_df = X.drop(columns=initial_features)
    self.output_cols = output_df.columns.tolist()
    return output_df

ReduceMemoryProcessor

Bases: BasePreProcessor

Reduce memory usage as much as possible.

Credits to kainsama and others for writing about memory usage reduction for Numerai data: https://forum.numer.ai/t/reducing-memory/313

:param deep_mem_inspect: Introspect the data deeply by interrogating object dtypes. Yields a more accurate representation of memory usage if you have complex object columns. :param verbose: Print memory usage before and after optimization.
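
A minimal usage sketch (illustrative only): downcast the numeric columns of a random DataFrame and inspect the reported savings.

import numpy as np
import pandas as pd
from numerblox.preprocessing.signals import ReduceMemoryProcessor

dataf = pd.DataFrame(
    np.random.uniform(size=(10_000, 5)),
    columns=[f"feature_{i}" for i in range(5)],
)
processor = ReduceMemoryProcessor(deep_mem_inspect=True, verbose=True)
reduced = processor.transform(dataf)        # NumPy array with downcast dtypes
print(processor.get_feature_names_out())    # original column names are preserved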

Source code in numerblox/preprocessing/signals.py
class ReduceMemoryProcessor(BasePreProcessor):
    """
    Reduce memory usage as much as possible.

    Credits to kainsama and others for writing about memory usage reduction for Numerai data:
    https://forum.numer.ai/t/reducing-memory/313

    :param deep_mem_inspect: Introspect the data deeply by interrogating object dtypes.
    Yields a more accurate representation of memory usage if you have complex object columns.
    :param verbose: Print memory usage before and after optimization.
    """

    def __init__(self, deep_mem_inspect=False, verbose=True):
        super().__init__()
        self.deep_mem_inspect = deep_mem_inspect
        self.verbose = verbose

    def transform(self, dataf: Union[np.array, pd.DataFrame]) -> np.array:
        return self._reduce_mem_usage(dataf).to_numpy()

    def _reduce_mem_usage(self, dataf: Union[np.array, pd.DataFrame]) -> pd.DataFrame:
        """
        Iterate through all columns and modify the numeric column types
        to reduce memory usage.
        """
        dataf = pd.DataFrame(dataf)
        self.output_cols = dataf.columns.tolist()
        start_memory_usage = (
            dataf.memory_usage(deep=self.deep_mem_inspect).sum() / 1024**2
        )
        if self.verbose:
            print(
                f"Memory usage of DataFrame is {round(start_memory_usage, 2)} MB"
            )

        for col in dataf.columns:
            col_type = dataf[col].dtype.name

            if col_type not in [
                "object",
                "category",
                "datetime64[ns, UTC]",
                "datetime64[ns]",
            ]:
                c_min = dataf[col].min()
                c_max = dataf[col].max()
                if str(col_type)[:3] == "int":
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        dataf[col] = dataf[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int16).min
                        and c_max < np.iinfo(np.int16).max
                    ):
                        dataf[col] = dataf[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int32).min
                        and c_max < np.iinfo(np.int32).max
                    ):
                        dataf[col] = dataf[col].astype(np.int32)
                    elif (
                        c_min > np.iinfo(np.int64).min
                        and c_max < np.iinfo(np.int64).max
                    ):
                        dataf[col] = dataf[col].astype(np.int64)
                else:
                    if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                    ):
                        dataf[col] = dataf[col].astype(np.float16)
                    elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                    ):
                        dataf[col] = dataf[col].astype(np.float32)
                    else:
                        dataf[col] = dataf[col].astype(np.float64)

        end_memory_usage = (
            dataf.memory_usage(deep=self.deep_mem_inspect).sum() / 1024**2
        )
        if self.verbose:
            print(
                f"Memory usage after optimization is: {round(end_memory_usage, 2)} MB"
            )
            print(
                f"Usage decreased by {round(100 * (start_memory_usage - end_memory_usage) / start_memory_usage, 2)}%"
            )
        return dataf

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        return self.output_cols if not input_features else input_features

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/signals.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    return self.output_cols if not input_features else input_features

TickerMapper

Bases: BasePreProcessor

Map ticker from one format to another.

:param ticker_col: Column used for mapping. Must already be present in the input data.

:param target_ticker_format: Format to map tickers to. Must be present in the ticker map.

For the default mapper, the supported ticker formats are: ['ticker', 'bloomberg_ticker', 'yahoo']

:param mapper_path: Path to CSV file containing at least ticker_col and target_ticker_format columns.

Can be either a web link or a local path. Numerai Signals mapping by default.
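
A minimal usage sketch (illustrative only; the ticker values below are made up). Note that the default mapper_path points to a CSV on S3, so instantiation requires internet access.

import pandas as pd
from numerblox.preprocessing.signals import TickerMapper

mapper = TickerMapper(ticker_col="ticker", target_ticker_format="bloomberg_ticker")
tickers = pd.Series(["AAPL", "MSFT"])     # hypothetical tickers in 'ticker' format
mapped = mapper.transform(tickers)        # NumPy array of mapped tickers (NaN if unmapped)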

Source code in numerblox/preprocessing/signals.py
class TickerMapper(BasePreProcessor):
    """
    Map ticker from one format to another. \n
    :param ticker_col: Column used for mapping. Must already be present in the input data. \n
    :param target_ticker_format: Format to map tickers to. Must be present in the ticker map. \n
    For the default mapper, the supported ticker formats are: ['ticker', 'bloomberg_ticker', 'yahoo'] \n
    :param mapper_path: Path to CSV file containing at least ticker_col and target_ticker_format columns. \n
    Can be either a web link or a local path. Numerai Signals mapping by default.
    """

    def __init__(
        self, ticker_col: str = "ticker", target_ticker_format: str = "bloomberg_ticker",
        mapper_path: str = "https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv"
    ):
        super().__init__()
        self.ticker_col = ticker_col
        self.target_ticker_format = target_ticker_format

        self.signals_map_path = mapper_path
        self.ticker_map = pd.read_csv(self.signals_map_path)

        assert (
            self.ticker_col in self.ticker_map.columns
        ), f"Ticker column '{self.ticker_col}' is not available in ticker mapping."
        assert (
            self.target_ticker_format in self.ticker_map.columns
        ), f"Target ticker column '{self.target_ticker_format}' is not available in ticker mapping."

        self.mapping = dict(
            self.ticker_map[[self.ticker_col, self.target_ticker_format]].values
        )

    def transform(self, X: Union[np.array, pd.Series]) -> np.array:
        """
        Transform ticker column.
        :param X: Ticker column
        :return tickers: Mapped tickers
        """
        tickers = pd.DataFrame(X, columns=[self.ticker_col])[self.ticker_col].map(self.mapping)
        return tickers.to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        return [self.target_ticker_format] if not input_features else input_features

transform(X)

Transform ticker column. :param X: Ticker column :return tickers: Mapped tickers

Source code in numerblox/preprocessing/signals.py
def transform(self, X: Union[np.array, pd.Series]) -> np.array:
    """
    Transform ticker column.
    :param X: Ticker column
    :return tickers: Mapped tickers
    """
    tickers = pd.DataFrame(X, columns=[self.ticker_col])[self.ticker_col].map(self.mapping)
    return tickers.to_numpy()

BaseTargetProcessor

Bases: BaseEstimator, TransformerMixin

Common functionality for preprocessors and postprocessors.

Source code in numerblox/targets.py
class BaseTargetProcessor(BaseEstimator, TransformerMixin):
    """Common functionality for preprocessors and postprocessors."""

    def __init__(self):
        sklearn.set_config(enable_metadata_routing=True)
        self.set_transform_request(era_series=True)

    def fit(self, X, y=None):
        self.is_fitted_ = True
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], y=None
    ) -> pd.DataFrame:
        ...

    @abstractmethod
    def get_feature_names_out(self, input_features=None) -> List[str]:
        ...

BayesianGMMTargetProcessor

Bases: BaseTargetProcessor

Generate synthetic (fake) target using a Bayesian Gaussian Mixture model.

Based on Michael Oliver's GitHub Gist implementation:

https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93

:param n_components: Number of components for fitting Bayesian Gaussian Mixture Model.
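
A rough sketch with synthetic data (illustrative only). Note that, as the source below shows, fit adds 'era' and 'target' columns to X in place and transform expects them to be present, so the same X object is reused here and numeric era labels are used.

import numpy as np
import pandas as pd
from numerblox.targets import BayesianGMMTargetProcessor

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.uniform(size=(200, 4)), columns=[f"feature_{i}" for i in range(4)])
y = pd.Series(rng.uniform(size=200), name="target")
era_series = pd.Series(np.repeat(np.arange(1, 11), 20), name="era")  # 10 numeric eras

processor = BayesianGMMTargetProcessor(n_components=2)
processor.fit(X, y, era_series=era_series)
fake_target = processor.transform(X, era_series=era_series)  # values in {0, 0.25, 0.5, 0.75, 1}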

Source code in numerblox/targets.py
class BayesianGMMTargetProcessor(BaseTargetProcessor):
    """
    Generate synthetic (fake) target using a Bayesian Gaussian Mixture model. \n
    Based on Michael Oliver's GitHub Gist implementation: \n
    https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93

    :param n_components: Number of components for fitting Bayesian Gaussian Mixture Model.
    """
    def __init__(
        self,
        n_components: int = 3,
    ):
        super().__init__()
        self.set_fit_request(era_series=True)
        self.n_components = n_components
        self.ridge = Ridge(fit_intercept=False)
        self.bins = [0, 0.05, 0.25, 0.75, 0.95, 1]

    def fit(self, X: pd.DataFrame, y: pd.Series, era_series: pd.Series):
        """
        Fit Bayesian Gaussian Mixture model on coefficients and normalize.
        :param X: DataFrame containing features.
        :param y: Series containing real target.
        :param era_series: Series containing era information.
        """
        bgmm = BayesianGaussianMixture(n_components=self.n_components)
        coefs = self._get_coefs(dataf=X, y=y, era_series=era_series)
        bgmm.fit(coefs)
        # make probability of sampling each component equal to better balance rare regimes
        bgmm.weights_[:] = 1 / self.n_components
        self.bgmm_ = bgmm
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame, era_series: pd.Series) -> np.array:
        """
        Main method for generating fake target.
        :param X: DataFrame containing features.
        :param era_series: Series containing era information.
        """
        check_is_fitted(self, "bgmm_")
        assert len(X) == len(era_series), "X and eras must be same length."
        all_eras = era_series.unique().tolist()
        # Scale data between 0 and 1
        X = X.astype(float)
        X /= X.max()
        X -= 0.5
        X.loc[:, 'era'] = era_series

        fake_target = self._generate_target(dataf=X, all_eras=all_eras)
        return fake_target

    def _get_coefs(self, dataf: pd.DataFrame, y: pd.Series, era_series: pd.Series) -> np.ndarray:
        """
        Generate coefficients for BGMM.
        :param dataf: DataFrame containing features.
        :param y: Series containing real target.
        """
        coefs = []
        dataf.loc[:, 'era'] = era_series
        dataf.loc[:, 'target'] = y
        all_eras = dataf['era'].unique().tolist()
        for era in all_eras:
            era_df = dataf[dataf['era'] == era]
            era_y = era_df.loc[:, 'target']
            era_df = era_df.drop(columns=["era", "target"])
            self.ridge.fit(era_df, era_y)
            coefs.append(self.ridge.coef_)
        stacked_coefs = np.vstack(coefs)
        return stacked_coefs

    def _generate_target(
        self, dataf: pd.DataFrame, all_eras: list
    ) -> np.ndarray:
        """Generate fake target using Bayesian Gaussian Mixture model."""
        fake_target = []
        for era in tqdm(all_eras, desc="Generating fake target"):
            features = dataf[dataf['era'] == era]
            features = features.drop(columns=["era", "target"])
            # Sample a set of weights from GMM
            beta, _ = self.bgmm_.sample(1)
            # Create fake continuous target
            fake_targ = features @ beta[0]
            # Bin fake target like real target
            fake_targ = (rankdata(fake_targ) - 0.5) / len(fake_targ)
            fake_targ = (np.digitize(fake_targ, self.bins) - 1) / 4
            fake_target.append(fake_targ)
        return np.concatenate(fake_target)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        return ["fake_target"] if not input_features else input_features

fit(X, y, era_series)

Fit Bayesian Gaussian Mixture model on coefficients and normalize. :param X: DataFrame containing features. :param y: Series containing real target. :param era_series: Series containing era information.

Source code in numerblox/targets.py
def fit(self, X: pd.DataFrame, y: pd.Series, era_series: pd.Series):
    """
    Fit Bayesian Gaussian Mixture model on coefficients and normalize.
    :param X: DataFrame containing features.
    :param y: Series containing real target.
    :param era_series: Series containing era information.
    """
    bgmm = BayesianGaussianMixture(n_components=self.n_components)
    coefs = self._get_coefs(dataf=X, y=y, era_series=era_series)
    bgmm.fit(coefs)
    # make probability of sampling each component equal to better balance rare regimes
    bgmm.weights_[:] = 1 / self.n_components
    self.bgmm_ = bgmm
    self.is_fitted_ = True
    return self

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/targets.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    return ["fake_target"] if not input_features else input_features

transform(X, era_series)

Main method for generating fake target. :param X: DataFrame containing features. :param era_series: Series containing era information.

Source code in numerblox/targets.py
def transform(self, X: pd.DataFrame, era_series: pd.Series) -> np.array:
    """
    Main method for generating fake target.
    :param X: DataFrame containing features.
    :param era_series: Series containing era information.
    """
    check_is_fitted(self, "bgmm_")
    assert len(X) == len(era_series), "X and eras must be same length."
    all_eras = era_series.unique().tolist()
    # Scale data between 0 and 1
    X = X.astype(float)
    X /= X.max()
    X -= 0.5
    X.loc[:, 'era'] = era_series

    fake_target = self._generate_target(dataf=X, all_eras=all_eras)
    return fake_target

SignalsTargetProcessor

Bases: BaseTargetProcessor

Engineer targets for Numerai Signals.

More information on how Numerai Signals targets are implemented:

https://forum.numer.ai/t/decoding-the-signals-target/2501

:param price_col: Column from which target will be derived.

:param windows: Timeframes to use for engineering targets. 10 and 20-day by default.

:param bins: Binning used to create group targets. Nomi binning by default.

:param labels: Scaling for binned target. Must be same length as resulting bins (bins-1). Numerai labels by default.
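
A rough sketch on synthetic prices (illustrative only): engineer raw, rank and group targets for the default 10- and 20-day windows, grouped per era.

import numpy as np
import pandas as pd
from numerblox.targets import SignalsTargetProcessor

rng = np.random.default_rng(0)
n = 300
dataf = pd.DataFrame({"close": 100 + np.cumsum(rng.normal(size=n))})
era_series = pd.Series(np.repeat(np.arange(n // 10), 10), name="era")

processor = SignalsTargetProcessor(price_col="close")
targets = processor.transform(dataf, era_series=era_series)   # shape (300, 6)
print(processor.get_feature_names_out())
# ['target_10d_raw', 'target_10d_rank', 'target_10d_group', 'target_20d_raw', ...]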

Source code in numerblox/targets.py
class SignalsTargetProcessor(BaseTargetProcessor):
    """
    Engineer targets for Numerai Signals. \n
    More information on how Numerai Signals targets are implemented: \n
    https://forum.numer.ai/t/decoding-the-signals-target/2501

    :param price_col: Column from which target will be derived. \n
    :param windows: Timeframes to use for engineering targets. 10 and 20-day by default. \n
    :param bins: Binning used to create group targets. Nomi binning by default. \n
    :param labels: Scaling for binned target. Must be same length as resulting bins (bins-1). Numerai labels by default.
    """

    def __init__(
        self,
        price_col: str = "close",
        windows: list = None,
        bins: list = None,
        labels: list = None,
    ):
        super().__init__()
        self.price_col = price_col
        self.windows = windows if windows else [10, 20]
        self.bins = bins if bins else [0, 0.05, 0.25, 0.75, 0.95, 1]
        self.labels = labels if labels else [0, 0.25, 0.50, 0.75, 1]

    def transform(self, dataf: pd.DataFrame, era_series: pd.Series) -> np.array:
        for window in tqdm(self.windows, desc="Signals target engineering windows"):
            dataf.loc[:, f"target_{window}d_raw"] = (
                dataf[self.price_col].pct_change(periods=window).shift(-window)
            )
            era_groups = dataf.groupby(era_series)

            dataf.loc[:, f"target_{window}d_rank"] = era_groups[
                f"target_{window}d_raw"
            ].rank(pct=True, method="first")
            dataf.loc[:, f"target_{window}d_group"] = era_groups[
                f"target_{window}d_rank"
            ].transform(
                lambda group: pd.cut(
                    group, bins=self.bins, labels=self.labels, include_lowest=True
                )
            )
        output_cols = self.get_feature_names_out()
        return dataf[output_cols].to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names of Signals targets. """
        if not input_features:
            feature_names = []
            for window in self.windows:
                feature_names.append(f"target_{window}d_raw")
                feature_names.append(f"target_{window}d_rank")
                feature_names.append(f"target_{window}d_group")
        else:
            feature_names = input_features
        return feature_names

get_feature_names_out(input_features=None)

Return feature names of Signals targets.

Source code in numerblox/targets.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names of Signals targets. """
    if not input_features:
        feature_names = []
        for window in self.windows:
            feature_names.append(f"target_{window}d_raw")
            feature_names.append(f"target_{window}d_rank")
            feature_names.append(f"target_{window}d_group")
    else:
        feature_names = input_features
    return feature_names

CrossValEstimator

Bases: BaseEstimator, TransformerMixin

Split your data into multiple folds and fit an estimator on each fold. For transforms, predictions are concatenated into a 2D array.

:param cv: Cross validation object that follows scikit-learn conventions.

:param estimator: Estimator to fit on each fold.

:param evaluation_func: Custom evaluation logic that is executed on validation data for each fold. Must accept y_true and y_pred as input. For example, evaluation_func can handle logging metrics for each fold. Anything that evaluation_func returns is stored in self.eval_results_.

:param predict_func: Name of the function that will be used for prediction. Must be one of 'predict', 'predict_proba', 'predict_log_proba'. For example, XGBRegressor has 'predict' and 'predict_proba' functions.

:param verbose: Whether to print progress.
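
A minimal sketch (illustrative only): fit one Ridge model per TimeSeriesSplit fold and stack the per-fold predictions column-wise.

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit
from numerblox.meta import CrossValEstimator

rng = np.random.default_rng(0)
X, y = rng.uniform(size=(500, 10)), rng.uniform(size=500)

cve = CrossValEstimator(estimator=Ridge(), cv=TimeSeriesSplit(n_splits=5))
cve.fit(X, y)
preds = cve.transform(X)             # shape (500, 5): one column per fold model
print(cve.get_feature_names_out())   # ['CrossValEstimator_Ridge_predict_0', ...]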

Source code in numerblox/meta.py
class CrossValEstimator(BaseEstimator, TransformerMixin):
    """
    Split your data into multiple folds and fit an estimator on each fold.
    For transforms, predictions are concatenated into a 2D array.
    :param cv: Cross validation object that follows scikit-learn conventions.
    :param estimator: Estimator to fit on each fold.
    :param evaluation_func: Custom evaluation logic that is executed on validation data for each fold. Must accept y_true and y_pred as input.
    For example, evaluation_func can handle logging metrics for each fold.
    Anything that evaluation_func returns is stored in `self.eval_results_`.
    :param predict_func: Name of the function that will be used for prediction.
    Must be one of 'predict', 'predict_proba', 'predict_log_proba'.
    For example, XGBRegressor has 'predict' and 'predict_proba' functions.
    :param verbose: Whether to print progress.
    """
    def __init__(self, estimator: BaseEstimator, cv: BaseCrossValidator, evaluation_func=None, predict_func="predict", verbose=False):
        sklearn.set_config(enable_metadata_routing=True)
        super().__init__()
        self.cv = cv
        if not hasattr(self.cv, "split") or isinstance(self.cv, str):
            raise ValueError("cv must be a valid sklearn cv object withat least a 'split' function.")
        self.estimator = estimator
        self.estimator_name = estimator.__class__.__name__
        self.evaluation_func = evaluation_func

        if predict_func not in ["predict", "predict_proba", "predict_log_proba"]:
            raise ValueError("predict_func must be 'predict', 'predict_proba', or 'predict_log_proba'.")
        self.predict_func = predict_func
        assert hasattr(self.estimator, self.predict_func), f"Estimator {self.estimator_name} does not have {self.predict_func} function."
        self.verbose = verbose

    def fit(self, X: Union[np.array, pd.DataFrame], y: Union[np.array, pd.Series], **kwargs):
        """ Use cross validation object to fit estimators. """
        self.estimators_ = []
        self.eval_results_ = []
        if isinstance(X, (pd.Series, pd.DataFrame)):
            X = X.reset_index(drop=True).values
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.reset_index(drop=True).values
        for i, (train_idx, val_idx) in tqdm(enumerate(self.cv.split(X, y)), 
                                            desc=f"CrossValEstimator Fitting. Estimator='{self.estimator_name}'", 
                                            total=self.cv.get_n_splits(), 
                                            disable=not self.verbose):
            estimator = clone(self.estimator)
            if self.verbose:
                print(f"Fitting {self.estimator_name} on fold {len(self.estimators_)}")


            estimator.fit(X[train_idx], y[train_idx], **kwargs)

            # Execute custom evaluation logic
            if self.evaluation_func:
                if self.verbose:
                    print(f"Running evaluation on fold {len(self.estimators_)}")

                y_pred = getattr(estimator, self.predict_func)(X[val_idx])
                y_pred = self._postprocess_pred(y_pred)
                eval_fold = self.evaluation_func(y[val_idx], y_pred)
                if self.verbose:
                    print(f"CrossValEstimator (estimator='{self.estimator_name}'): Fold '{i}' evaluation results: '{eval_fold}'")
                self.eval_results_.append(eval_fold)

            self.estimators_.append(estimator)

            # Store output shape by doing inference on 1st sample of training set
            if i == 0:
                sample_prediction = getattr(estimator, self.predict_func)(X[train_idx][:1])
                sample_prediction = self._postprocess_pred(sample_prediction)
                self.output_shape_ = sample_prediction.shape[1:]
                self.multi_output_ = len(y.shape) > 1
                self.n_outputs_per_model_ = np.prod(self.output_shape_).astype(int)
        return self

    def transform(self, X, model_idxs: List[int] = None, **kwargs) -> np.array:
        """ 
        Use the estimators fitted on each cross validation fold to generate predictions.
        :param X: Input data for inference.
        :param model_idxs: List of indices of models to use for inference. 
        By default, all fitted models are used.
        :param kwargs: Additional arguments to pass to the estimator's predict function.
        """
        check_is_fitted(self)        
        inference_estimators = [self.estimators_[i] for i in model_idxs] if model_idxs else self.estimators_

        # Create an empty array to store predictions
        final_predictions = np.zeros((X.shape[0], len(inference_estimators) * self.n_outputs_per_model_))
        # Iterate through models to get predictions
        for idx, estimator in enumerate(inference_estimators):
            pred = getattr(estimator, self.predict_func)(X, **kwargs)
            pred = self._postprocess_pred(pred)

            # Calculate where to place these predictions in the final array
            start_idx = idx * self.n_outputs_per_model_
            end_idx = (idx + 1) * self.n_outputs_per_model_

            final_predictions[:, start_idx:end_idx] = pred

        return final_predictions

    def predict(self, X, model_idxs: List[int] = None, **kwargs) -> np.array:
        return self.transform(X, model_idxs, **kwargs)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        check_is_fitted(self)
        base_str = f"CrossValEstimator_{self.estimator_name}_{self.predict_func}"
        # Single-output case
        if self.n_outputs_per_model_ == 1:
            feature_names = [f"{base_str}_{i}" for i in range(len(self.estimators_))]
        # Multi-output case
        else:
            feature_names = []
            for i in range(len(self.estimators_)):
                for j in range(self.n_outputs_per_model_):
                    feature_names.append(f"{base_str}_{i}_output_{j}")
        return feature_names

    def _postprocess_pred(self, pred):
        # Make sure predictions are 2D
        if len(pred.shape) == 1:
            pred = pred.reshape(-1, 1)
        return pred

    def __sklearn_is_fitted__(self) -> bool:
        """ Check fitted status. """
        # Must have a fitted estimator for each split.
        return len(self.estimators_) == self.cv.get_n_splits()

__sklearn_is_fitted__()

Check fitted status.

Source code in numerblox/meta.py
def __sklearn_is_fitted__(self) -> bool:
    """ Check fitted status. """
    # Must have a fitted estimator for each split.
    return len(self.estimators_) == self.cv.get_n_splits()

fit(X, y, **kwargs)

Use cross validation object to fit estimators.

Source code in numerblox/meta.py
def fit(self, X: Union[np.array, pd.DataFrame], y: Union[np.array, pd.Series], **kwargs):
    """ Use cross validation object to fit estimators. """
    self.estimators_ = []
    self.eval_results_ = []
    if isinstance(X, (pd.Series, pd.DataFrame)):
        X = X.reset_index(drop=True).values
    if isinstance(y, (pd.Series, pd.DataFrame)):
        y = y.reset_index(drop=True).values
    for i, (train_idx, val_idx) in tqdm(enumerate(self.cv.split(X, y)), 
                                        desc=f"CrossValEstimator Fitting. Estimator='{self.estimator_name}'", 
                                        total=self.cv.get_n_splits(), 
                                        disable=not self.verbose):
        estimator = clone(self.estimator)
        if self.verbose:
            print(f"Fitting {self.estimator_name} on fold {len(self.estimators_)}")


        estimator.fit(X[train_idx], y[train_idx], **kwargs)

        # Execute custom evaluation logic
        if self.evaluation_func:
            if self.verbose:
                print(f"Running evaluation on fold {len(self.estimators_)}")

            y_pred = getattr(estimator, self.predict_func)(X[val_idx])
            y_pred = self._postprocess_pred(y_pred)
            eval_fold = self.evaluation_func(y[val_idx], y_pred)
            if self.verbose:
                print(f"CrossValEstimator (estimator='{self.estimator_name}'): Fold '{i}' evaluation results: '{eval_fold}'")
            self.eval_results_.append(eval_fold)

        self.estimators_.append(estimator)

        # Store output shape by doing inference on 1st sample of training set
        if i == 0:
            sample_prediction = getattr(estimator, self.predict_func)(X[train_idx][:1])
            sample_prediction = self._postprocess_pred(sample_prediction)
            self.output_shape_ = sample_prediction.shape[1:]
            self.multi_output_ = len(y.shape) > 1
            self.n_outputs_per_model_ = np.prod(self.output_shape_).astype(int)
    return self

transform(X, model_idxs=None, **kwargs)

Use the estimators fitted on each cross validation fold to generate predictions. :param X: Input data for inference. :param model_idxs: List of indices of models to use for inference. By default, all fitted models are used. :param kwargs: Additional arguments to pass to the estimator's predict function.

Source code in numerblox/meta.py
def transform(self, X, model_idxs: List[int] = None, **kwargs) -> np.array:
    """ 
    Use the estimators fitted on each cross validation fold to generate predictions.
    :param X: Input data for inference.
    :param model_idxs: List of indices of models to use for inference. 
    By default, all fitted models are used.
    :param kwargs: Additional arguments to pass to the estimator's predict function.
    """
    check_is_fitted(self)        
    inference_estimators = [self.estimators_[i] for i in model_idxs] if model_idxs else self.estimators_

    # Create an empty array to store predictions
    final_predictions = np.zeros((X.shape[0], len(inference_estimators) * self.n_outputs_per_model_))
    # Iterate through models to get predictions
    for idx, estimator in enumerate(inference_estimators):
        pred = getattr(estimator, self.predict_func)(X, **kwargs)
        pred = self._postprocess_pred(pred)

        # Calculate where to place these predictions in the final array
        start_idx = idx * self.n_outputs_per_model_
        end_idx = (idx + 1) * self.n_outputs_per_model_

        final_predictions[:, start_idx:end_idx] = pred

    return final_predictions

MetaEstimator

Bases: BaseEstimator, TransformerMixin, MetaEstimatorMixin

Helper for NumeraiPipeline and NumeraiFeatureUnion to use a model as a transformer.

:param estimator: Underlying estimator like XGBoost, Catboost, scikit-learn, etc.

:param predict_func: Name of the function that will be used for prediction. Must be one of 'predict', 'predict_proba', 'predict_log_proba'. For example, XGBRegressor has 'predict' and 'predict_proba' functions.

:param model_type: "regressor" or "classifier". Used to determine if the estimator is multi output.
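
A minimal sketch (illustrative only, with a scikit-learn Ridge standing in for any model): wrap the estimator so its predictions come out of transform and it can sit in the middle of a pipeline.

import numpy as np
from sklearn.linear_model import Ridge
from numerblox.meta import MetaEstimator

rng = np.random.default_rng(0)
X, y = rng.uniform(size=(100, 5)), rng.uniform(size=100)

meta = MetaEstimator(Ridge(), predict_func="predict", model_type="regressor")
meta.fit(X, y)
preds = meta.transform(X)              # shape (100, 1)
print(meta.get_feature_names_out())    # ['Ridge_predict_output']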

Source code in numerblox/meta.py
class MetaEstimator(BaseEstimator, TransformerMixin, MetaEstimatorMixin):
    """
    Helper for NumeraiPipeline and NumeraiFeatureUnion to use a model as a transformer.

    :param estimator: Underlying estimator like XGBoost, Catboost, scikit-learn, etc.
    :param predict_func: Name of the function that will be used for prediction.
    Must be one of 'predict', 'predict_proba', 'predict_log_proba'.
    For example, XGBRegressor has 'predict' and 'predict_proba' functions.
    :param model_type: "regressor" or "classifier". Used to determine if the estimator is multi output.
    """

    def __init__(self, estimator, predict_func="predict", model_type="regressor"):
        sklearn.set_config(enable_metadata_routing=True)
        self.estimator = estimator
        if predict_func not in ["predict", "predict_proba", "predict_log_proba", "transform"]:
            raise ValueError("predict_func must be 'predict', 'predict_proba', 'predict_log_proba' or 'transform'.")
        self.predict_func = predict_func
        assert model_type in ["regressor", "classifier"], "model_type must be 'regressor' or 'classifier'."
        assert hasattr(self.estimator, self.predict_func), f"Estimator {self.estimator.__class__.__name__} does not have {self.predict_func} function."
        self.model_type = model_type
        # predict_proba for classifiers -> multi output
        self.proba_class_ = predict_func == "predict_proba" and model_type == "classifier"

    def fit(self, X: Union[np.array, pd.DataFrame], y, **kwargs):
        """
        Fit underlying estimator and set attributes.
        """
        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES, multi_output=True)
        # Either multi target or outputs are probabilities
        self.multi_output_ = len(y.shape) > 1 or self.proba_class_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X, y, **kwargs)
        return self

    def transform(self, X: Union[np.array, pd.DataFrame], **kwargs) -> np.array:
        """
        Apply the `predict_func` on the fitted estimator.

        Shape `(X.shape[0], )` if estimator is not multi output and else `(X.shape[0], y.shape[1])`.
        All additional kwargs are passed to the underlying estimator's predict function.
        """
        check_is_fitted(self, "estimator_")
        output = getattr(self.estimator_, self.predict_func)(X, **kwargs)
        return output if self.multi_output_ else output.reshape(-1, 1)

    def predict(self, X: Union[np.array, pd.DataFrame], **kwargs) -> np.array:
        """ 
        For if a MetaEstimator happens to be the last step in the pipeline. Has same behavior as transform.
        """
        return self.transform(X, **kwargs)

    def get_feature_names_out(self, input_features = None) -> List[str]:
        check_is_fitted(self)
        feature_names = [f"{self.estimator.__class__.__name__}_{self.predict_func}_output"]
        return feature_names if not input_features else input_features

fit(X, y, **kwargs)

Fit underlying estimator and set attributes.

Source code in numerblox/meta.py
def fit(self, X: Union[np.array, pd.DataFrame], y, **kwargs):
    """
    Fit underlying estimator and set attributes.
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES, multi_output=True)
    # Either multi target or outputs are probabilities
    self.multi_output_ = len(y.shape) > 1 or self.proba_class_
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X, y, **kwargs)
    return self

predict(X, **kwargs)

For if a MetaEstimator happens to be the last step in the pipeline. Has same behavior as transform.

Source code in numerblox/meta.py
def predict(self, X: Union[np.array, pd.DataFrame], **kwargs) -> np.array:
    """ 
    For if a MetaEstimator happens to be the last step in the pipeline. Has same behavior as transform.
    """
    return self.transform(X, **kwargs)

transform(X, **kwargs)

Apply the predict_func on the fitted estimator.

Shape (X.shape[0], ) if estimator is not multi output and else (X.shape[0], y.shape[1]). All additional kwargs are passed to the underlying estimator's predict function.

Source code in numerblox/meta.py
def transform(self, X: Union[np.array, pd.DataFrame], **kwargs) -> np.array:
    """
    Apply the `predict_func` on the fitted estimator.

    Shape `(X.shape[0], )` if estimator is not multi output and else `(X.shape[0], y.shape[1])`.
    All additional kwargs are passed to the underlying estimator's predict function.
    """
    check_is_fitted(self, "estimator_")
    output = getattr(self.estimator_, self.predict_func)(X, **kwargs)
    return output if self.multi_output_ else output.reshape(-1, 1)

MetaPipeline

Bases: Pipeline

Pipeline which turns all estimators into transformers by wrapping them in MetaEstimator. This makes it possible to have pipeline steps after models. For example, a FeatureNeutralizer after an XGBRegressor.

:param steps: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an instance of BaseNeutralizer.

:param memory: Used to cache the fitted transformers of the pipeline.

:param verbose: If True, the time elapsed while fitting each step will be printed as it is completed.

:param predict_func: Name of the function that will be used for prediction.

Source code in numerblox/meta.py
class MetaPipeline(Pipeline):
    """
    Pipeline which turns all estimators into transformers by wrapping them in MetaEstimator.
    This makes it possible to have pipeline steps after models.
    For example, a FeatureNeutralizer after an XGBRegressor.

    :param steps: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an instance of BaseNeutralizer.
    :param memory: Used to cache the fitted transformers of the pipeline.
    :param verbose: If True, the time elapsed while fitting each step will be printed as it is completed.
    :param predict_func: Name of the function that will be used for prediction.
    """
    def __init__(self, steps, memory=None, verbose=False, predict_func="predict"):
        sklearn.set_config(enable_metadata_routing=True)
        self.predict_func = predict_func
        self.modified_steps = self.wrap_estimators_as_transformers(steps)
        self.steps = self.modified_steps
        self.memory = memory
        self.verbose = verbose

    def wrap_estimators_as_transformers(self, steps):
        """
        Converts all estimator steps (except the last step) into transformers by wrapping them in MetaEstimator.
        :param steps: List of (name, transform) tuples specifying the pipeline steps.
        :return: Modified steps with all estimators wrapped as transformers.
        """
        transformed_steps = []
        for i, step_tuple in enumerate(steps):
            is_last_step = i == len(steps) - 1

            if len(step_tuple) == 3:
                name, step, columns = step_tuple
                transformed_steps.append(self._wrap_step(name, step, columns, is_last_step))
            else:
                name, step = step_tuple
                transformed_steps.append(self._wrap_step(name, step, is_last_step=is_last_step))
        return transformed_steps

    def _wrap_step(self, name, step, columns=None, is_last_step=False):
            """ Recursive function to wrap steps """
            # Recursive call
            if isinstance(step, (Pipeline, FeatureUnion, ColumnTransformer)):
                if isinstance(step, Pipeline):
                    transformed = step.__class__(self.wrap_estimators_as_transformers(step.steps))
                elif isinstance(step, FeatureUnion):
                    transformed = FeatureUnion(self.wrap_estimators_as_transformers(step.transformer_list))
                elif isinstance(step, ColumnTransformer):
                    transformed_transformers = self.wrap_estimators_as_transformers(step.transformers)
                    transformed = ColumnTransformer(transformed_transformers)
                return (name, transformed, columns) if columns else (name, transformed)

            # If it's the last step and it doesn't have a transform method, don't wrap it
            if is_last_step and not hasattr(step, 'transform'):
                return (name, step, columns) if columns else (name, step)

            # Wrap estimator that has the predict function but not the transform function
            elif hasattr(step, self.predict_func) and not hasattr(step, 'transform'):
                return (name, MetaEstimator(step, predict_func=self.predict_func))

            return (name, step, columns) if columns else (name, step)

wrap_estimators_as_transformers(steps)

Converts all estimator steps (except the last step) into transformers by wrapping them in MetaEstimator. :param steps: List of (name, transform) tuples specifying the pipeline steps. :return: Modified steps with all estimators wrapped as transformers.

Source code in numerblox/meta.py
def wrap_estimators_as_transformers(self, steps):
    """
    Converts all estimator steps (except the last step) into transformers by wrapping them in MetaEstimator.
    :param steps: List of (name, transform) tuples specifying the pipeline steps.
    :return: Modified steps with all estimators wrapped as transformers.
    """
    transformed_steps = []
    for i, step_tuple in enumerate(steps):
        is_last_step = i == len(steps) - 1

        if len(step_tuple) == 3:
            name, step, columns = step_tuple
            transformed_steps.append(self._wrap_step(name, step, columns, is_last_step))
        else:
            name, step = step_tuple
            transformed_steps.append(self._wrap_step(name, step, is_last_step=is_last_step))
    return transformed_steps

make_meta_pipeline(*steps, memory=None, verbose=False)

Convenience function for creating a MetaPipeline.

:param steps: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an instance of BaseNeutralizer.

:param memory: Used to cache the fitted transformers of the pipeline.

:param verbose: If True, the time elapsed while fitting each step will be printed as it is completed.
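
A minimal construction sketch (illustrative only): a scaler, a model and a neutralizer chained into one pipeline. The Ridge step gets wrapped in a MetaEstimator automatically; fitting and predicting additionally require routing features and era_series to the neutralizer, as hinted in the trailing comment.

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from numerblox.meta import make_meta_pipeline
from numerblox.neutralizers import FeatureNeutralizer

pipeline = make_meta_pipeline(
    StandardScaler(),
    Ridge(),
    FeatureNeutralizer(proportion=0.5),
)
# e.g. pipeline.fit(X, y) followed by
# pipeline.predict(X, features=features, era_series=era_series)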

Source code in numerblox/meta.py
def make_meta_pipeline(*steps, memory=None, verbose=False) -> MetaPipeline:
    """ 
    Convenience function for creating a MetaPipeline. 
    :param steps: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an instance of BaseNeutralizer.
    :param memory: Used to cache the fitted transformers of the pipeline.
    :param verbose: If True, the time elapsed while fitting each step will be printed as it is completed.
    """
    return MetaPipeline(_name_estimators(steps), memory=memory, verbose=verbose)

BaseNeutralizer

Bases: BaseEstimator, TransformerMixin

Base class for neutralization so it is compatible with scikit-learn. :param new_col_names: List of column names for the neutralized output.

Source code in numerblox/neutralizers.py
class BaseNeutralizer(BaseEstimator, TransformerMixin):
    """
    Base class for neutralization so it is compatible with scikit-learn.
    :param new_col_names: List of column names for the neutralized output.
    """
    def __init__(self, new_col_names: list):
        self.new_col_names = new_col_names
        sklearn.set_config(enable_metadata_routing=True)
        self.set_transform_request(features=True, era_series=True)
        self.set_predict_request(features=True, era_series=True)
        super().__init__()

    def fit(self, X=None, y=None):
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], 
        features: pd.DataFrame, era_series: pd.Series
    ) -> np.array:
        ...

    def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
        """ Convenience function for scikit-learn compatibility. """
        return self.transform(X=X, features=features, era_series=era_series)

    def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
        """ 
        Convenience function for scikit-learn compatibility.
        Needed because fit and transform expect different arguments here.
        """
        return self.fit().transform(X=X, features=features, era_series=era_series)

    def get_feature_names_out(self, input_features: list = None) -> list:
        """ 
        Get feature names for neutralized output.

        :param input_features: Optional list of input feature names.
        :return: List of feature names for neutralized output.
        """
        return input_features if input_features else self.new_col_names

fit_transform(X, features, era_series=None)

Convenience function for scikit-learn compatibility. Needed because fit and transform expect different arguments here.

Source code in numerblox/neutralizers.py
def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
    """ 
    Convenience function for scikit-learn compatibility.
    Needed because fit and transform expect different arguments here.
    """
    return self.fit().transform(X=X, features=features, era_series=era_series)

get_feature_names_out(input_features=None)

Get feature names for neutralized output.

:param input_features: Optional list of input feature names. :return: List of feature names for neutralized output.

Source code in numerblox/neutralizers.py
def get_feature_names_out(self, input_features: list = None) -> list:
    """ 
    Get feature names for neutralized output.

    :param input_features: Optional list of input feature names.
    :return: List of feature names for neutralized output.
    """
    return input_features if input_features else self.new_col_names

predict(X, features, era_series=None)

Convenience function for scikit-learn compatibility.

Source code in numerblox/neutralizers.py
def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
    """ Convenience function for scikit-learn compatibility. """
    return self.transform(X=X, features=features, era_series=era_series)

FeatureNeutralizer

Bases: BaseNeutralizer

Classic feature neutralization by subtracting a linear model.

:param pred_name: Name of prediction column. For creating the new column name.

:param proportion: Number in range [0...1] indicating how much to neutralize.

:param suffix: Optional suffix that is added to new column name.

:param num_cores: Number of cores to use for parallel processing. By default, all CPU cores are used.
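
A minimal sketch (illustrative only): neutralize one prediction column against the features it was built from, per era, at 50% proportion.

import numpy as np
import pandas as pd
from numerblox.neutralizers import FeatureNeutralizer

rng = np.random.default_rng(0)
n = 200
features = pd.DataFrame(rng.uniform(size=(n, 5)), columns=[f"feature_{i}" for i in range(5)])
predictions = rng.uniform(size=n)
era_series = pd.Series(np.repeat(np.arange(1, 5), 50), name="era")

neutralizer = FeatureNeutralizer(pred_name="prediction", proportion=0.5, num_cores=1)
neutralized = neutralizer.fit_transform(predictions, features=features, era_series=era_series)
# -> array of shape (200, 1), scaled to [0...1]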

Source code in numerblox/neutralizers.py
class FeatureNeutralizer(BaseNeutralizer):
    """
    Classic feature neutralization by subtracting a linear model.

    :param pred_name: Name of prediction column. For creating the new column name. 
    :param proportion: Number in range [0...1] indicating how much to neutralize.
    :param suffix: Optional suffix that is added to new column name.
    :param num_cores: Number of cores to use for parallel processing.
    By default, all CPU cores are used.
    """
    def __init__(
        self,
        pred_name: Union[str, list] = "prediction",
        proportion: Union[float, List[float]] = 0.5,
        suffix: str = None,
        num_cores: int = -1
    ):
        self.pred_name = [pred_name] if isinstance(pred_name, str) else pred_name
        self.proportion = [proportion] if isinstance(proportion, float) else proportion
        assert len(self.pred_name) == len(set(self.pred_name)), "Duplicate 'pred_names' found. Make sure all names are unique."
        assert len(self.proportion) == len(set(self.proportion)), "Duplicate 'proportions' found. Make sure all proportions are unique."
        for prop in self.proportion:
            assert (
                0.0 <= prop <= 1.0
            ), f"'proportion' should be a float in range [0...1]. Got '{prop}'."

        new_col_names = []
        for pred_name in self.pred_name:
            for prop in self.proportion:
                new_col_names.append(
                    f"{pred_name}_neutralized_{prop}_{suffix}" if suffix else f"{pred_name}_neutralized_{prop}"
                )
        super().__init__(new_col_names=new_col_names)
        self.suffix = suffix
        self.num_cores = num_cores

    def transform(self, X: Union[np.array, pd.Series, pd.DataFrame], 
                  features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
        """
        Main transform function.
        :param X: Input predictions to neutralize. \n
        :param features: DataFrame with features for neutralization. \n
        :param era_series: Series with era labels for each row in features. \n
        Features, era_series and the prediction column must all have the same length.
        :return: Neutralized predictions NumPy array.
        """
        if era_series is None:
            warnings.warn("WARNING: 'era_series' not provided for neutralization! Neutralization will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to neutralization in this way! Not providing 'era_series' is valid for live inference, where only one era is used to generate predictions.")
        else:
            assert len(X) == len(era_series), "Input predictions must have same length as era_series."

        if features is None:
            raise ValueError("`features` argument must be provided for neutralization.")
        assert len(X) == len(features), "Input predictions must have same length as features."

        df = features.copy()
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        # Ensure X is a 2D array and has the same number of columns as pred_name
        if X.ndim == 1:
            assert len(self.pred_name) == 1, "Only one prediction column found. Please input a 2D array or define one column for 'pred_name'."
            X = X.reshape(-1, 1)
        else:
            assert len(self.pred_name) == X.shape[1], "Number of prediction columns given in X does not match 'pred_name'."
        for i, pred_name in enumerate(self.pred_name):
            df[pred_name] = X[:, i]
        # Treat input as 1 era if era_series is not provided.
        df["era"] = era_series if era_series is not None else "X"

        feature_cols = list(features.columns)
        tasks = [
            delayed(self._process_pred_name)(df, pred_name, proportion, feature_cols)
            for pred_name in tqdm(self.pred_name, desc="Processing feature neutralizations") 
            for proportion in self.proportion
        ]
        neutralized_results = Parallel(n_jobs=self.num_cores)(tasks)
        neutralized_preds = pd.concat(neutralized_results, axis=1).to_numpy()
        return neutralized_preds

    def _process_pred_name(self, df: pd.DataFrame, pred_name: str, proportion: float, feature_cols: List[str]) -> pd.DataFrame:
        """ 
        Process one combination of prediction and proportion.
        :param df: DataFrame with features and predictions.
        :param pred_name: Name of prediction column.
        :param proportion: Proportion to neutralize.
        :param feature_cols: List of feature column names.
        :return: Neutralized predictions.
        Neutralized predictions are scaled to [0...1].
        """
        neutralized_pred = df.groupby("era", group_keys=False).apply(
            lambda x: self.normalize_and_neutralize(x, [pred_name], feature_cols, proportion)
        )
        return pd.DataFrame(MinMaxScaler().fit_transform(neutralized_pred))

    def neutralize(self, dataf: pd.DataFrame, columns: list, by: list, proportion: float) -> pd.DataFrame:
        """ 
        Neutralize on CPU. 
        :param dataf: DataFrame with features and predictions.
        :param columns: List of prediction column names.
        :param by: List of feature column names.
        :param proportion: Proportion to neutralize.
        :return: Neutralized predictions.
        """
        scores = dataf[columns]
        exposures = dataf[by].values
        scores = scores - proportion * self._get_raw_exposures(exposures, scores)
        return scores / scores.std()

    @staticmethod
    def normalize(dataf: pd.DataFrame) -> np.ndarray:
        """ Normalize predictions.
        1. Rank predictions.
        2. Normalize ranks.
        3. Gaussianize ranks.
        :param dataf: DataFrame with predictions.
        :return: Gaussianized rank predictions.
        """
        normalized_ranks = (dataf.rank(method="first") - 0.5) / len(dataf)
        # Gaussianized ranks
        return sp.norm.ppf(normalized_ranks)

    def normalize_and_neutralize(
        self, dataf: pd.DataFrame, columns: list, by: list, proportion: float
    ) -> pd.DataFrame:
        """ 
        Gaussianize predictions and neutralize with one combination of prediction and proportion. 
        :param dataf: DataFrame with features and predictions.
        :param columns: List of prediction column names.
        :param by: List of feature column names.
        :param proportion: Proportion to neutralize.
        :return: Neutralized predictions DataFrame.
        """
        dataf[columns] = self.normalize(dataf[columns])
        dataf[columns] = self.neutralize(dataf, columns, by, proportion)
        return dataf[columns]

    @staticmethod
    def _get_raw_exposures(exposures: np.array, scores: pd.DataFrame) -> np.array:
        """ 
        Get raw feature exposures.
        Make sure predictions are normalized!
        :param exposures: Exposures for each era. 
        :param scores: DataFrame with predictions.
        :return: Raw exposures for each era.
        """
        return exposures.dot(np.linalg.pinv(exposures).dot(scores))   

neutralize(dataf, columns, by, proportion)

Neutralize on CPU. :param dataf: DataFrame with features and predictions. :param columns: List of prediction column names. :param by: List of feature column names. :param proportion: Proportion to neutralize. :return: Neutralized predictions.

Source code in numerblox/neutralizers.py
def neutralize(self, dataf: pd.DataFrame, columns: list, by: list, proportion: float) -> pd.DataFrame:
    """ 
    Neutralize on CPU. 
    :param dataf: DataFrame with features and predictions.
    :param columns: List of prediction column names.
    :param by: List of feature column names.
    :param proportion: Proportion to neutralize.
    :return: Neutralized predictions.
    """
    scores = dataf[columns]
    exposures = dataf[by].values
    scores = scores - proportion * self._get_raw_exposures(exposures, scores)
    return scores / scores.std()
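
The subtraction in neutralize is a linear projection: the scores are regressed on the feature columns via the pseudoinverse and the fitted part is removed, scaled by proportion. A minimal sketch of that identity with hypothetical random data (not part of NumerBlox):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
exposures = rng.uniform(size=(100, 5))                       # stand-in for dataf[by].values
scores = pd.DataFrame({"prediction": rng.normal(size=100)})  # stand-in for normalized scores

# The term removed by neutralize (proportion=1.0) is the least-squares
# projection of the scores onto the feature columns.
projection = exposures.dot(np.linalg.pinv(exposures).dot(scores))
residual = scores - projection

# The residual is orthogonal to every feature column (up to floating point error).
print(np.abs(exposures.T @ residual.values).max())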

normalize(dataf) staticmethod

Normalize predictions.
1. Rank predictions.
2. Normalize ranks.
3. Gaussianize ranks.

:param dataf: DataFrame with predictions. :return: Gaussianized rank predictions.

Source code in numerblox/neutralizers.py
@staticmethod
def normalize(dataf: pd.DataFrame) -> np.ndarray:
    """ Normalize predictions.
    1. Rank predictions.
    2. Normalize ranks.
    3. Gaussianize ranks.
    :param dataf: DataFrame with predictions.
    :return: Gaussianized rank predictions.
    """
    normalized_ranks = (dataf.rank(method="first") - 0.5) / len(dataf)
    # Gaussianized ranks
    return sp.norm.ppf(normalized_ranks)
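
For intuition, a tiny worked example of normalize, assuming scipy.stats is imported as sp as in the source above:

import pandas as pd
import scipy.stats as sp

preds = pd.DataFrame({"prediction": [0.10, 0.40, 0.45, 0.90]})
ranks = (preds.rank(method="first") - 0.5) / len(preds)
# ranks: 0.125, 0.375, 0.625, 0.875
print(sp.norm.ppf(ranks).ravel())
# approx: [-1.15, -0.32, 0.32, 1.15]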

normalize_and_neutralize(dataf, columns, by, proportion)

Gaussianize predictions and neutralize with one combination of prediction and proportion. :param dataf: DataFrame with features and predictions. :param columns: List of prediction column names. :param by: List of feature column names. :param proportion: Proportion to neutralize. :return: Neutralized predictions DataFrame.

Source code in numerblox/neutralizers.py
def normalize_and_neutralize(
    self, dataf: pd.DataFrame, columns: list, by: list, proportion: float
) -> pd.DataFrame:
    """ 
    Gaussianize predictions and neutralize with one combination of prediction and proportion. 
    :param dataf: DataFrame with features and predictions.
    :param columns: List of prediction column names.
    :param by: List of feature column names.
    :param proportion: Proportion to neutralize.
    :return: Neutralized predictions DataFrame.
    """
    dataf[columns] = self.normalize(dataf[columns])
    dataf[columns] = self.neutralize(dataf, columns, by, proportion)
    return dataf[columns]

transform(X, features, era_series=None)

Main transform function. :param X: Input predictions to neutralize.

:param features: DataFrame with features for neutralization.

:param era_series: Series with era labels for each row in features.

Features, era_series and the prediction column must all have the same length. :return: Neutralized predictions NumPy array.

Source code in numerblox/neutralizers.py
def transform(self, X: Union[np.array, pd.Series, pd.DataFrame], 
              features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
    """
    Main transform function.
    :param X: Input predictions to neutralize. \n
    :param features: DataFrame with features for neutralization. \n
    :param era_series: Series with era labels for each row in features. \n
    Features, era_series and the prediction column must all have the same length.
    :return: Neutralized predictions NumPy array.
    """
    if era_series is None:
        warnings.warn("WARNING: 'era_series' not provided for neutralization! Neutralization will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to neutralization in this way! Not providing 'era_series' is valid for live inference, where only one era is used to generate predictions.")
    else:
        assert len(X) == len(era_series), "Input predictions must have same length as era_series."

    if features is None:
        raise ValueError("`features` argument must be provided for neutralization.")
    assert len(X) == len(features), "Input predictions must have same length as features."

    df = features.copy()
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    # Ensure X is a 2D array and has the same number of columns as pred_name
    if X.ndim == 1:
        assert len(self.pred_name) == 1, "Only one prediction column found. Please input a 2D array or define one column for 'pred_name'."
        X = X.reshape(-1, 1)
    else:
        assert len(self.pred_name) == X.shape[1], "Number of prediction columns given in X does not match 'pred_name'."
    for i, pred_name in enumerate(self.pred_name):
        df[pred_name] = X[:, i]
    # Treat input as 1 era if era_series is not provided.
    df["era"] = era_series if era_series is not None else "X"

    feature_cols = list(features.columns)
    tasks = [
        delayed(self._process_pred_name)(df, pred_name, proportion, feature_cols)
        for pred_name in tqdm(self.pred_name, desc="Processing feature neutralizations") 
        for proportion in self.proportion
    ]
    neutralized_results = Parallel(n_jobs=self.num_cores)(tasks)
    neutralized_preds = pd.concat(neutralized_results, axis=1).to_numpy()
    return neutralized_preds
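
For reference, a minimal usage sketch with hypothetical random data. It mirrors how BaseEvaluator.feature_neutral_mean_std_sharpe (shown further down this page) calls FeatureNeutralizer, and assumes the class is importable from numerblox.neutralizers:

import numpy as np
import pandas as pd
from numerblox.neutralizers import FeatureNeutralizer

rng = np.random.default_rng(0)
features = pd.DataFrame(rng.uniform(size=(200, 10)),
                        columns=[f"feature_{i}" for i in range(10)])
predictions = pd.Series(rng.uniform(size=200), name="prediction")
eras = pd.Series(np.repeat(["0001", "0002", "0003", "0004"], 50), name="era")

fn = FeatureNeutralizer(pred_name="prediction", proportion=0.5)
neutralized = fn.predict(predictions, features=features, era_series=eras)
print(neutralized.shape)  # (200, 1): one column per (pred_name, proportion) pair, scaled to [0...1]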

BasePenalizer

Bases: BaseEstimator, TransformerMixin

Base class for penalization so it is compatible with scikit-learn. :param new_col_name: Name of new neutralized column.

Source code in numerblox/penalizers.py
class BasePenalizer(BaseEstimator, TransformerMixin):
    """
    Base class for penalization so it is compatible with scikit-learn.
    :param new_col_name: Name of new neutralized column.
    """
    def __init__(self, new_col_name: str):
        sklearn.set_config(enable_metadata_routing=True)
        self.set_transform_request(features=True, era_series=True)
        self.set_predict_request(features=True, era_series=True)
        self.new_col_name = new_col_name
        super().__init__()

    def fit(self, X=None, y=None):
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], 
        features: pd.DataFrame, era_series: pd.Series
    ) -> np.array:
        ...

    def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array:
        """ Convenience function for scikit-learn compatibility. """
        return self.transform(X=X, features=features, era_series=era_series)

    def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array:
        """ 
        Convenience function for scikit-learn compatibility.
        Needed because fit and transform expect different arguments here.
        """
        return self.fit().transform(X=X, features=features, era_series=era_series)

    def get_feature_names_out(self, input_features: list = None) -> list:
        """ 
        Get feature names for neutralized output.

        :param input_features: Optional list of input feature names.
        :return: List of feature names for neutralized output.
        """
        return input_features if input_features else [self.new_col_name]

fit_transform(X, features, era_series)

Convenience function for scikit-learn compatibility. Needed because fit and transform expect different arguments here.

Source code in numerblox/penalizers.py
def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array:
    """ 
    Convenience function for scikit-learn compatibility.
    Needed because fit and transform expect different arguments here.
    """
    return self.fit().transform(X=X, features=features, era_series=era_series)

get_feature_names_out(input_features=None)

Get feature names for neutralized output.

:param input_features: Optional list of input feature names. :return: List of feature names for neutralized output.

Source code in numerblox/penalizers.py
def get_feature_names_out(self, input_features: list = None) -> list:
    """ 
    Get feature names for neutralized output.

    :param input_features: Optional list of input feature names.
    :return: List of feature names for neutralized output.
    """
    return input_features if input_features else [self.new_col_name]

predict(X, features, era_series)

Convenience function for scikit-learn compatibility.

Source code in numerblox/penalizers.py
def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array:
    """ Convenience function for scikit-learn compatibility. """
    return self.transform(X=X, features=features, era_series=era_series)
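
To illustrate the contract that subclasses must fulfil, here is a toy (hypothetical) penalizer that only clips predictions and ignores features and eras:

import numpy as np
import pandas as pd
from numerblox.penalizers import BasePenalizer

class ClippingPenalizer(BasePenalizer):
    """Toy example: clip predictions instead of penalizing feature exposure."""
    def __init__(self):
        super().__init__(new_col_name="prediction_clipped")

    def transform(self, X, features: pd.DataFrame, era_series: pd.Series) -> np.array:
        # A real penalizer would use `features` and `era_series`; this toy does not.
        return np.clip(np.asarray(X), 0.05, 0.95)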

FeaturePenalizer

Bases: BasePenalizer

Feature penalization with TensorFlow.

Source (by jrb): https://github.com/jonrtaylor/twitch/blob/master/FE_Clipping_Script.ipynb

Source of first PyTorch implementation (by Michael Oliver / mdo): https://forum.numer.ai/t/model-diagnostics-feature-exposure/899/12

:param max_exposure: Number in range [0...1] indicating how much to reduce max feature exposure to. :param pred_name: Prediction column name. Used for new column name.

:param suffix: Optional suffix that is added to new column name.

Source code in numerblox/penalizers.py
class FeaturePenalizer(BasePenalizer):
    """
    Feature penalization with TensorFlow.

    Source (by jrb): https://github.com/jonrtaylor/twitch/blob/master/FE_Clipping_Script.ipynb

    Source of first PyTorch implementation (by Michael Oliver / mdo): https://forum.numer.ai/t/model-diagnostics-feature-exposure/899/12

    :param max_exposure: Number in range [0...1] indicating how much to reduce max feature exposure to.
    :param pred_name: Prediction column name. Used for new column name. \n
    :param suffix: Optional suffix that is added to new column name.
    """
    def __init__(
        self,
        max_exposure: float,
        pred_name: str = "prediction",
        suffix: str = None,
    ):
        self.max_exposure = max_exposure
        self.pred_name = pred_name
        assert (
            0.0 <= max_exposure <= 1.0
        ), f"'max_exposure' should be a float in range [0...1]. Got '{self.max_exposure}'."
        new_col_name = (
            f"{self.pred_name}_penalized_{self.max_exposure}_{suffix}"
            if suffix
            else f"{self.pred_name}_penalized_{self.max_exposure}"
        )
        super().__init__(new_col_name=new_col_name)
        self.suffix = suffix

    def transform(self, X: pd.DataFrame, features: pd.DataFrame, era_series: pd.Series) -> np.array:
        """
        Main transform method.
        :param X: Input predictions to neutralize. 
        :param features: DataFrame with features for neutralization. 
        :param era_series: Series with era labels for each row in features. 
        Features, eras and the prediction column must all have the same length.
        :return: Penalized predictions.
        """
        assert len(X) == len(features), "Input predictions must have same length as features."
        assert len(X) == len(era_series), "Input predictions must have same length as eras."
        df = features.copy()
        df["prediction"] = X
        df["era"] = era_series
        penalized_data = self._reduce_all_exposures(
            dataf=df, column=self.pred_name, neutralizers=list(features.columns)
        )
        return penalized_data

    def _reduce_all_exposures(
        self,
        dataf: pd.DataFrame,
        column: str = "prediction",
        neutralizers: list = None,
        normalize=True,
        gaussianize=True,
    ) -> pd.DataFrame:
        neutralized = []

        for era in tqdm(dataf["era"].unique()):
            dataf_era = dataf[dataf["era"] == era]
            scores = dataf_era[[column]].values
            exposure_values = dataf_era[neutralizers].values

            if normalize:
                scores2 = []
                for x in scores.T:
                    x = (scipy.stats.rankdata(x, method="ordinal") - 0.5) / len(x)
                    if gaussianize:
                        x = scipy.stats.norm.ppf(x)
                    scores2.append(x)
                scores = np.array(scores2)[0]

            scores, _ = self._reduce_exposure(
                scores, exposure_values, len(neutralizers), None
            )

            scores /= tf.math.reduce_std(scores)
            scores -= tf.reduce_min(scores)
            scores /= tf.reduce_max(scores)
            neutralized.append(scores.numpy())

        predictions = pd.DataFrame(
            np.concatenate(neutralized), columns=[column], index=dataf.index
        )
        return predictions

    def _reduce_exposure(self, prediction, features, input_size=50, weights=None):
        model = tf.keras.models.Sequential(
            [
                tf.keras.layers.Input(input_size),
                tf.keras.experimental.LinearModel(use_bias=False),
            ]
        )
        feats = tf.convert_to_tensor(features - 0.5, dtype=tf.float32)
        pred = tf.convert_to_tensor(prediction, dtype=tf.float32)
        if weights is None:
            optimizer = tf.keras.optimizers.Adamax()
            start_exp = self.__exposures(feats, pred[:, None])
            target_exps = tf.clip_by_value(
                start_exp, -self.max_exposure, self.max_exposure
            )
            self._train_loop(model, optimizer, feats, pred, target_exps)
        else:
            model.set_weights(weights)
        return pred[:, None] - model(feats), model.get_weights()

    def _train_loop(self, model, optimizer, feats, pred, target_exps):
        for _ in range(1000000):
            loss, grads = self.__train_loop_body(model, feats, pred, target_exps)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            if loss < 1e-7:
                break

    def __train_loop_body(self, model, feats, pred, target_exps):
        with tf.GradientTape() as tape:
            exps = self.__exposures(feats, pred[:, None] - model(feats, training=True))
            loss = tf.reduce_sum(
                tf.nn.relu(tf.nn.relu(exps) - tf.nn.relu(target_exps))
                + tf.nn.relu(tf.nn.relu(-exps) - tf.nn.relu(-target_exps))
            )
        return loss, tape.gradient(loss, model.trainable_variables)

    @staticmethod
    def __exposures(x, y):
        x = x - tf.math.reduce_mean(x, axis=0)
        x = x / tf.norm(x, axis=0)
        y = y - tf.math.reduce_mean(y, axis=0)
        y = y / tf.norm(y, axis=0)
        return tf.matmul(x, y, transpose_a=True)

transform(X, features, era_series)

Main transform method. :param X: Input predictions to neutralize. :param features: DataFrame with features for neutralization. :param era_series: Series with era labels for each row in features. Features, eras and the prediction column must all have the same length. :return: Penalized predictions.

Source code in numerblox/penalizers.py
def transform(self, X: pd.DataFrame, features: pd.DataFrame, era_series: pd.Series) -> np.array:
    """
    Main transform method.
    :param X: Input predictions to neutralize. 
    :param features: DataFrame with features for neutralization. 
    :param era_series: Series with era labels for each row in features. 
    Features, eras and the prediction column must all have the same length.
    :return: Penalized predictions.
    """
    assert len(X) == len(features), "Input predictions must have same length as features."
    assert len(X) == len(era_series), "Input predictions must have same length as eras."
    df = features.copy()
    df["prediction"] = X
    df["era"] = era_series
    penalized_data = self._reduce_all_exposures(
        dataf=df, column=self.pred_name, neutralizers=list(features.columns)
    )
    return penalized_data
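
A minimal usage sketch with hypothetical data, assuming FeaturePenalizer is importable from numerblox.penalizers. Note that it runs a TensorFlow optimization per era, so it is considerably slower than FeatureNeutralizer:

import numpy as np
import pandas as pd
from numerblox.penalizers import FeaturePenalizer

rng = np.random.default_rng(0)
features = pd.DataFrame(rng.uniform(size=(100, 10)),
                        columns=[f"feature_{i}" for i in range(10)])
predictions = pd.Series(rng.uniform(size=100), name="prediction")
eras = pd.Series(np.repeat(["0001", "0002"], 50), name="era")

penalizer = FeaturePenalizer(max_exposure=0.10)
penalized = penalizer.predict(predictions, features=features, era_series=eras)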

BasePredictionLoader

Bases: BaseEstimator, TransformerMixin

Shared functionality for all Prediction Loaders.

Source code in numerblox/prediction_loaders.py
class BasePredictionLoader(BaseEstimator, TransformerMixin):
    """ Shared functionality for all Prediction Loaders. """
    def __init__(self):
        ...

    def fit(self, X=None, y=None):
        return self

    @abstractmethod
    def transform(self, X=None, y=None) -> pd.DataFrame:
        """ Return Predictions generated by model. """
        ...

    @abstractmethod
    def get_feature_names_out(self, input_features=None):
        """ Return feature names. """
        ...

get_feature_names_out(input_features=None) abstractmethod

Return feature names.

Source code in numerblox/prediction_loaders.py
@abstractmethod
def get_feature_names_out(self, input_features=None):
    """ Return feature names. """
    ...

transform(X=None, y=None) abstractmethod

Return Predictions generated by model.

Source code in numerblox/prediction_loaders.py
@abstractmethod
def transform(self, X=None, y=None) -> pd.DataFrame:
    """ Return Predictions generated by model. """
    ...
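
A minimal sketch (not part of NumerBlox) of a custom loader that fulfils this contract by reading predictions from a local Parquet file; the path is hypothetical:

import pandas as pd
from pathlib import Path
from numerblox.prediction_loaders import BasePredictionLoader

class LocalParquetPredictions(BasePredictionLoader):
    """Toy loader: return predictions stored in a local Parquet file."""
    def __init__(self, file_path: str = "my_model_preds.parquet"):
        super().__init__()
        self.file_path = file_path

    def transform(self, X=None, y=None) -> pd.DataFrame:
        return pd.read_parquet(self.file_path)

    def get_feature_names_out(self, input_features=None):
        return input_features or [Path(self.file_path).stem]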

ExamplePredictions

Bases: BasePredictionLoader

Load example predictions. :param file_name: File to download from NumerAPI. By default this is example predictions for v4.2 data ('v4.2/live_example_preds.parquet'). Example predictions in previous versions:
- v4.2 validation examples -> "v4.2/validation_example_preds.parquet"
- v4.2 live benchmark models -> "v4.2/live_benchmark_models.parquet"
- v4.2 validation benchmark models -> "v4.2/validation_benchmark_models.parquet"

:param round_num: Optional round number. Downloads most recent round by default. :param keep_files: Whether to keep downloaded files. By default, files are deleted after the predictions are loaded.

Source code in numerblox/prediction_loaders.py
class ExamplePredictions(BasePredictionLoader):
    """
    Load example predictions.
    :param file_name: File to download from NumerAPI.
    By default this is example predictions for v4.2 data.
    'v4.2/live_example_preds.parquet' by default. 
    Example predictions in previous versions:
    - v4.2. validation examples -> "v4.2/validation_example_preds.parquet"
    - v4.2. live benchmark models -> "v4.2/live_benchmark_models.parquet"
    - v4.2. validation benchmark models -> "v4.2/validation_benchmark_models.parquet"
    :param round_num: Optional round number. Downloads most recent round by default.
    :param keep_files: Whether to keep downloaded files.
    By default, files are deleted after the predictions are loaded.
    """
    def __init__(self, file_name: str = "v4.2/live_example_preds.parquet",
                 round_num: int = None, keep_files: bool = False):
        super().__init__()
        self.file_name = file_name
        self.round_num = round_num
        self.keep_files = keep_files

    def transform(self, X=None, y=None) -> pd.DataFrame:
        """ Return example predictions. """
        self._download_example_preds()
        example_preds = self._load_example_preds()
        if not self.keep_files:
            self.downloader.remove_base_directory()
        return example_preds

    def _download_example_preds(self):
        data_directory = f"example_predictions_loader_{uuid4()}"
        self.downloader = NumeraiClassicDownloader(directory_path=data_directory)
        self.dest_path = f"{str(self.downloader.dir)}/{self.file_name}"
        self.downloader.download_single_dataset(filename=self.file_name,
                                                dest_path=self.dest_path,
                                                round_num=self.round_num)

    def _load_example_preds(self, *args, **kwargs):
        return pd.read_parquet(self.dest_path, *args, **kwargs)

    def get_feature_names_out(self, input_features=None):
        return [Path(self.file_name).with_suffix('').as_posix()] if not input_features else input_features

transform(X=None, y=None)

Return example predictions.

Source code in numerblox/prediction_loaders.py
def transform(self, X=None, y=None) -> pd.DataFrame:
    """ Return example predictions. """
    self._download_example_preds()
    example_preds = self._load_example_preds()
    if not self.keep_files:
        self.downloader.remove_base_directory()
    return example_preds
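
A minimal usage sketch; this downloads data through NumerAPI, so it needs network access:

from numerblox.prediction_loaders import ExamplePredictions

example_preds = ExamplePredictions().transform()
print(example_preds.head())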

BaseEvaluator

Evaluation functionality that is relevant for both Numerai Classic and Numerai Signals.

Metrics include:
- Mean, Standard Deviation and Sharpe (Corrv2) for era returns.
- Max drawdown.
- Annual Percentage Yield (APY).
- Correlation with benchmark predictions.
- Max feature exposure: https://forum.numer.ai/t/model-diagnostics-feature-exposure/899.
- Feature Neutral Mean, Standard deviation and Sharpe: https://docs.numer.ai/tournament/feature-neutral-correlation.
- Smart Sharpe
- Exposure Dissimilarity: https://forum.numer.ai/t/true-contribution-details/5128/4.
- Autocorrelation (1st order).
- Calmar Ratio.
- Performance vs. Benchmark predictions.
- Mean, Standard Deviation and Sharpe for TB200 (Buy top 200 stocks and sell bottom 200 stocks).
- Mean, Standard Deviation and Sharpe for TB500 (Buy top 500 stocks and sell bottom 500 stocks).

:param metrics_list: List of metrics to calculate. Default: FAST_METRICS. :param era_col: Column name pointing to eras. Most commonly "era" for Numerai Classic and "date" for Numerai Signals. :param custom_functions: Additional functions called in evaluation. Check out the NumerBlox docs on evaluation for more info on using custom functions. :param show_detailed_progress_bar: Show detailed progress bar for evaluation of each prediction column.

Note that we calculate the sample standard deviation with ddof=0. It may differ slightly from the standard Pandas calculation, but is consistent with how NumPy computes standard deviation. More info: https://stackoverflow.com/questions/24984178/different-std-in-pandas-vs-numpy
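
As a concrete illustration of the custom_functions format consumed in evaluation_one_col (see the source below), here is a hypothetical metric whose 'dataf' and 'pred_col' arguments are resolved from the evaluator's local variables via 'local_args':

import pandas as pd

def median_prediction(dataf: pd.DataFrame, pred_col: str) -> float:
    """Hypothetical custom metric: median of the prediction column."""
    return dataf[pred_col].median()

custom_functions = {
    "median_prediction": {
        "func": median_prediction,
        "args": {"dataf": "dataf", "pred_col": "pred_col"},
        "local_args": ["dataf", "pred_col"],
    }
}
# Passed to an evaluator via the custom_functions parameter described above.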

Source code in numerblox/evaluation.py
class BaseEvaluator:
    """
    Evaluation functionality that is relevant for both
    Numerai Classic and Numerai Signals.

    Metrics include:
    - Mean, Standard Deviation and Sharpe (Corrv2) for era returns.
    - Max drawdown.
    - Annual Percentage Yield (APY).
    - Correlation with benchmark predictions.
    - Max feature exposure: https://forum.numer.ai/t/model-diagnostics-feature-exposure/899.
    - Feature Neutral Mean, Standard deviation and Sharpe: https://docs.numer.ai/tournament/feature-neutral-correlation.
    - Smart Sharpe
    - Exposure Dissimilarity: https://forum.numer.ai/t/true-contribution-details/5128/4.
    - Autocorrelation (1st order).
    - Calmar Ratio.
    - Performance vs. Benchmark predictions.
    - Mean, Standard Deviation and Sharpe for TB200 (Buy top 200 stocks and sell bottom 200 stocks).
    - Mean, Standard Deviation and Sharpe for TB500 (Buy top 500 stocks and sell bottom 500 stocks).

    :param metrics_list: List of metrics to calculate. Default: FAST_METRICS.
    :param era_col: Column name pointing to eras. Most commonly "era" for Numerai Classic and "date" for Numerai Signals.
    :param custom_functions: Additional functions called in evaluation.
    Check out the NumerBlox docs on evaluation for more info on using custom functions.
    :param show_detailed_progress_bar: Show detailed progress bar for evaluation of each prediction column.

    Note that we calculate the sample standard deviation with ddof=0.
    It may differ slightly from the standard Pandas calculation, but
    is consistent with how NumPy computes standard deviation.
    More info:
    https://stackoverflow.com/questions/24984178/different-std-in-pandas-vs-numpy
    """

    def __init__(
        self,
        metrics_list: List[str],
        era_col: str,
        custom_functions: Dict[str, Dict[str, Any]],
        show_detailed_progress_bar: bool,
    ):
        self.era_col = era_col
        self.metrics_list = metrics_list
        self.custom_functions = custom_functions
        if self.custom_functions is not None:
            self.check_custom_functions()
        self.show_detailed_progress_bar = show_detailed_progress_bar
        sklearn.set_config(enable_metadata_routing=True)

    def full_evaluation(
        self,
        dataf: pd.DataFrame,
        pred_cols: List[str],
        target_col: str = "target",
        benchmark_cols: list = None,
    ) -> pd.DataFrame:
        """
        Perform evaluation for each prediction column in pred_cols.
        By default only the "prediction" column is evaluated.
        Evaluation is done against given target and benchmark prediction column.
        :param dataf: DataFrame containing era_col, pred_cols, target_col and optional benchmark_cols.
        :param pred_cols: List of prediction columns to calculate evaluation metrics for.
        :param target_col: Target column to evaluate against.
        :param benchmark_cols: Optional list of benchmark columns to calculate evaluation metrics for.
        """
        val_stats = pd.DataFrame()
        feature_cols = [col for col in dataf.columns if col.startswith("feature")]
        cat_cols = (
            dataf[feature_cols].select_dtypes(include=["category"]).columns.to_list()
        )
        if cat_cols:
            print(
                f"WARNING: Categorical features detected that cannot be used for neutralization. Removing columns: '{cat_cols}' for evaluation."
            )
            dataf.loc[:, feature_cols] = dataf[feature_cols].select_dtypes(
                exclude=["category"]
            )
        dataf = dataf.fillna(0.5)
        for col in tqdm(pred_cols, desc="Evaluation: "):
            col_stats = self.evaluation_one_col(
                dataf=dataf,
                pred_col=col,
                feature_cols=feature_cols,
                target_col=target_col,
                benchmark_cols=benchmark_cols,
            )
            val_stats = pd.concat([val_stats, col_stats], axis=0)
        return val_stats

    def evaluation_one_col(
        self,
        dataf: pd.DataFrame,
        feature_cols: list,
        pred_col: str,
        target_col: str,
        benchmark_cols: list = None,
    ):
        """
        Perform evaluation for one prediction column
        against given target and benchmark column(s).
        """
        assert (
            self.era_col in dataf.columns
        ), f"Era column '{self.era_col}' not found in DataFrame. Make sure to set the correct era_col."
        assert (
                pred_col in dataf.columns
            ), f"Prediction column '{pred_col}' not found in DataFrame. Make sure to set the correct pred_col."
        assert (
            target_col in dataf.columns
        ), f"Target column '{target_col}' not found in DataFrame. Make sure to set the correct target_col."
        if benchmark_cols:
            for col in benchmark_cols:
                assert (
                    col in dataf.columns
                ), f"Benchmark column '{col}' not found in DataFrame. Make sure to set the correct benchmark_cols."

        # Check that all values are between 0 and 1
        assert (
            dataf[pred_col].min().min() >= 0 and dataf[pred_col].max().max() <= 1
        ), "All predictions should be between 0 and 1 (inclusive)."
        assert (
            dataf[target_col].min() >= 0 and dataf[target_col].max() <= 1
        ), "All targets should be between 0 and 1 (inclusive)."
        if benchmark_cols is not None:
            for col in benchmark_cols:
                assert (
                    dataf[col].min() >= 0 and dataf[col].max() <= 1
                ), f"All predictions for '{col}' should be between 0 and 1 (inclusive)."

        if self.show_detailed_progress_bar:
            len_metrics_list = len(self.metrics_list)
            len_benchmark_cols = 0 if benchmark_cols is None else len(benchmark_cols)
            len_custom_functions = 0 if self.custom_functions is None else len(list(self.custom_functions.keys()))
            len_pbar = len_metrics_list + len_benchmark_cols + len_custom_functions
            pbar = tqdm(total=len_pbar, desc="Evaluation")

        col_stats = {}
        col_stats["target"] = target_col

        # Compute stats per era (only if needed)
        per_era_numerai_corrs = self.per_era_numerai_corrs(
            dataf=dataf, pred_col=pred_col, target_col=target_col
        )

        # check if mean, std, or sharpe are in metrics_list
        if "mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"mean_std_sharpe for evaluation")
                pbar.update(1)
            mean, std, sharpe = self.mean_std_sharpe(era_corrs=per_era_numerai_corrs)
            col_stats["mean"] = mean
            col_stats["std"] = std
            col_stats["sharpe"] = sharpe

        if "legacy_mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"legacy_mean_std_sharpe for evaluation")
                pbar.update(1)
            per_era_corrs = self.per_era_corrs(
                dataf=dataf, pred_col=pred_col, target_col=target_col
            )
            legacy_mean, legacy_std, legacy_sharpe = self.mean_std_sharpe(
                era_corrs=per_era_corrs
            )
            col_stats["legacy_mean"] = legacy_mean
            col_stats["legacy_std"] = legacy_std
            col_stats["legacy_sharpe"] = legacy_sharpe

        if "max_drawdown" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"max_drawdown for evaluation")
                pbar.update(1)
            col_stats["max_drawdown"] = self.max_drawdown(
                era_corrs=per_era_numerai_corrs
            )

        if "apy":
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"apy for evaluation")
                pbar.update(1)
            col_stats["apy"] = self.apy(era_corrs=per_era_numerai_corrs)

        if "calmar_ratio" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"calmar_ratio for evaluation")
                pbar.update(1)
            if not "max_drawdown" in self.metrics_list:
                col_stats["max_drawdown"] = self.max_drawdown(
                    era_corrs=per_era_numerai_corrs
                )
            if not "apy" in self.metrics_list:
                col_stats["apy"] = self.apy(era_corrs=per_era_numerai_corrs)
            col_stats["calmar_ratio"] = (
                np.nan
                if col_stats["max_drawdown"] == 0
                else col_stats["apy"] / -col_stats["max_drawdown"]
            )

        if "autocorrelation" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description(f"autocorrelation for evaluation")
                pbar.update(1)
            col_stats["autocorrelation"] = self.autocorr1(per_era_numerai_corrs)

        if "max_feature_exposure" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"max_feature_exposure for evaluation")
                pbar.update(1)
            col_stats["max_feature_exposure"] = self.max_feature_exposure(
                dataf=dataf, feature_cols=feature_cols, pred_col=pred_col
            )

        if "smart_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"smart_sharpe for evaluation")
                pbar.update(1)
            col_stats["smart_sharpe"] = self.smart_sharpe(
                era_corrs=per_era_numerai_corrs
            )

        if benchmark_cols is not None:
            for bench_col in benchmark_cols:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"Evaluation for benchmark column: '{bench_col}'")
                    pbar.update(1)

                per_era_bench_corrs = self.per_era_numerai_corrs(
                    dataf=dataf, pred_col=bench_col, target_col=target_col
                )

                if "mean_std_sharpe" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"mean_std_sharpe for benchmark column: '{bench_col}'")
                    bench_mean, bench_std, bench_sharpe = self.mean_std_sharpe(
                        era_corrs=per_era_bench_corrs
                    )
                    col_stats[f"mean_vs_{bench_col}"] = mean - bench_mean
                    col_stats[f"std_vs_{bench_col}"] = std - bench_std
                    col_stats[f"sharpe_vs_{bench_col}"] = sharpe - bench_sharpe

                if "mc_mean_std_sharpe" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"mc_mean_std_sharpe for benchmark column: '{bench_col}'")
                    mc_scores = self.contributive_correlation(
                        dataf=dataf,
                        pred_col=pred_col,
                        target_col=target_col,
                        other_col=bench_col,
                    )
                    col_stats[f"mc_mean_{bench_col}"] = np.nanmean(mc_scores)
                    col_stats[f"mc_std_{bench_col}"] = np.nanstd(mc_scores)
                    col_stats[f"mc_sharpe_{bench_col}"] = (
                        np.nan
                        if col_stats[f"mc_std_{bench_col}"] == 0
                        else col_stats[f"mc_mean_{bench_col}"]
                        / col_stats[f"mc_std_{bench_col}"]
                    )

                if "corr_with" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"corr_with for benchmark column: '{bench_col}'")
                    col_stats[f"corr_with_{bench_col}"] = self.cross_correlation(
                        dataf=dataf, pred_col=pred_col, other_col=bench_col
                    )

                if "legacy_mc_mean_std_sharpe" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"legacy_mc_mean_std_sharpe for benchmark column: '{bench_col}'")
                    legacy_mc_scores = self.legacy_contribution(
                        dataf=dataf,
                        pred_col=pred_col,
                        target_col=target_col,
                        other_col=bench_col,
                    )
                    col_stats[f"legacy_mc_mean_{bench_col}"] = np.nanmean(
                        legacy_mc_scores
                    )
                    col_stats[f"legacy_mc_std_{bench_col}"] = np.nanstd(
                        legacy_mc_scores
                    )
                    col_stats[f"legacy_mc_sharpe_{bench_col}"] = (
                        np.nan
                        if col_stats[f"legacy_mc_std_{bench_col}"] == 0
                        else col_stats[f"legacy_mc_mean_{bench_col}"]
                        / col_stats[f"legacy_mc_std_{bench_col}"]
                    )

                if "ex_diss" in self.metrics_list or "ex_diss_pearson" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"ex_diss_pearson for benchmark column: '{bench_col}'")
                    col_stats[
                        f"exposure_dissimilarity_pearson_{bench_col}"
                    ] = self.exposure_dissimilarity(
                        dataf=dataf, pred_col=pred_col, other_col=bench_col,
                        corr_method="pearson"
                    )
                if "ex_diss_spearman" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"ex_diss_spearman for benchmark column: '{bench_col}'")
                    col_stats[
                        f"exposure_dissimilarity_spearman_{bench_col}"
                    ] = self.exposure_dissimilarity(
                        dataf=dataf, pred_col=pred_col, other_col=bench_col,
                        corr_method="spearman"
                    )

        # Compute intensive stats
        if "fn_mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"fn_mean_std_sharpe for evaluation")
                pbar.update(1)
            fn_mean, fn_std, fn_sharpe = self.feature_neutral_mean_std_sharpe(
                dataf=dataf,
                pred_col=pred_col,
                target_col=target_col,
                feature_names=feature_cols,
            )
            col_stats["feature_neutral_mean"] = fn_mean
            col_stats["feature_neutral_std"] = fn_std
            col_stats["feature_neutral_sharpe"] = fn_sharpe

        if "tb200_mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"tb200_mean_std_sharpe for evaluation")
                pbar.update(1)
            tb200_mean, tb200_std, tb200_sharpe = self.tbx_mean_std_sharpe(
                dataf=dataf, pred_col=pred_col, target_col=target_col, tb=200
            )
            col_stats["tb200_mean"] = tb200_mean
            col_stats["tb200_std"] = tb200_std
            col_stats["tb200_sharpe"] = tb200_sharpe

        if "tb500_mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"tb500_mean_std_sharpe for evaluation")
                pbar.update(1)
            tb500_mean, tb500_std, tb500_sharpe = self.tbx_mean_std_sharpe(
                dataf=dataf, pred_col=pred_col, target_col=target_col, tb=500
            )
            col_stats["tb500_mean"] = tb500_mean
            col_stats["tb500_std"] = tb500_std
            col_stats["tb500_sharpe"] = tb500_sharpe

        # Custom functions
        if self.custom_functions is not None:
            local_vars = locals()
            for func_name, func_info in self.custom_functions.items():
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"custom function: '{func_name}' for evaluation")
                    pbar.update(1)
                func = func_info['func']
                args = func_info['args']
                local_args = func_info['local_args']
                resolved_args = {}
                for k, v in args.items():
                    # Resolve variables defined as local args
                    if isinstance(v, str) and v in local_args:
                        if v not in local_vars:
                            raise ValueError(f"Variable '{v}' was defined in 'local_args', but was not found in local variables. Make sure to set the correct local_args.")
                        else:
                            resolved_args[k] = local_vars[v]
                    else:
                        resolved_args[k] = v
                col_stats[func_name] = func(**resolved_args)

        col_stats_df = pd.DataFrame(col_stats, index=[pred_col])
        if self.show_detailed_progress_bar:
            pbar.update(1)
            pbar.close()
        return col_stats_df

    def per_era_corrs(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str
    ) -> pd.Series:
        """Correlation between prediction and target for each era."""
        return dataf.groupby(self.era_col).apply(
            lambda d: self._normalize_uniform(d[pred_col].fillna(0.5)).corr(
                d[target_col]
            )
        )

    def per_era_numerai_corrs(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str
    ) -> pd.Series:
        """Numerai Corr between prediction and target for each era."""
        return dataf.groupby(self.era_col).apply(
            lambda d: self.numerai_corr(d.fillna(0.5), pred_col, target_col)
        )

    def mean_std_sharpe(
        self, era_corrs: pd.Series
    ) -> Tuple[np.float64, np.float64, np.float64]:
        """
        Average, standard deviation and Sharpe ratio for
        correlations per era.
        """
        mean = pd.Series(era_corrs.mean()).item()
        std = pd.Series(era_corrs.std(ddof=0)).item()
        sharpe = np.nan if std == 0 else mean / std
        return mean, std, sharpe

    def numerai_corr(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str
    ) -> np.float64:
        """
        Computes 'Numerai Corr' aka 'Corrv2'.
        More info: https://forum.numer.ai/t/target-cyrus-new-primary-target/6303

        Assumes original target col as input (i.e. in [0, 1] range).
        """
        # Rank and gaussianize predictions
        ranked_preds = self._normalize_uniform(
            dataf[pred_col].fillna(0.5), method="average"
        )
        gauss_ranked_preds = stats.norm.ppf(ranked_preds)
        # Center target from [0...1] to [-0.5...0.5] range
        targets = dataf[target_col]
        centered_target = targets - targets.mean()
        # Accentuate tails of predictions and targets
        preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5
        target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5
        # Pearson correlation
        corr, _ = stats.pearsonr(preds_p15, target_p15)
        return corr

    @staticmethod
    def max_drawdown(era_corrs: pd.Series) -> np.float64:
        """Maximum drawdown per era."""
        # Arbitrarily large window
        rolling_max = (
            (era_corrs + 1).cumprod().rolling(window=9000, min_periods=1).max()
        )
        daily_value = (era_corrs + 1).cumprod()
        max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
        return max_drawdown

    @staticmethod
    def apy(era_corrs: pd.Series, stake_compounding_lag: int = 4) -> np.float64:
        """
        Annual percentage yield.
        :param era_corrs: Correlation scores by era
        :param stake_compounding_lag: Compounding lag for Numerai rounds (4 for Numerai Classic)
        """
        payout_scores = era_corrs.clip(-0.25, 0.25)
        payout_product = (payout_scores + 1).prod()
        return (
            payout_product
            ** (
                # 52 weeks of compounding minus n for stake compounding lag
                (52 - stake_compounding_lag)
                / len(payout_scores)
            )
            - 1
        ) * 100

    def cross_correlation(self, dataf: pd.DataFrame, pred_col: str, other_col: str):
        """
        Corrv2 correlation with other predictions (like another model, example predictions or meta model prediction).
        :param dataf: DataFrame containing both pred_col and other_col.
        :param pred_col: Main Prediction.
        :param other_col: Other prediction column to calculate correlation with pred_col.

        :return: Correlation between Corrv2's of pred_col and other_col.
        """
        return self.per_era_numerai_corrs(
            dataf=dataf,
            pred_col=pred_col,
            target_col=other_col,
        ).mean()

    def max_feature_exposure(
        self, dataf: pd.DataFrame, feature_cols: List[str], pred_col: str
    ) -> np.float64:
        """Maximum exposure over all features."""
        max_per_era = dataf.groupby(self.era_col).apply(
            lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max()
        )
        max_feature_exposure = max_per_era.mean(skipna=True)
        return max_feature_exposure

    def feature_neutral_mean_std_sharpe(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str, feature_names: list
    ) -> Tuple[np.float64, np.float64, np.float64]:
        """
        Feature neutralized mean performance.
        More info: https://docs.numer.ai/tournament/feature-neutral-correlation
        """
        fn = FeatureNeutralizer(pred_name=pred_col, proportion=1.0)
        fn.set_predict_request(features=True, era_series=True)
        neutralized_preds = fn.predict(
            dataf[pred_col], features=dataf[feature_names], era_series=dataf[self.era_col]
        )
        # Construct new DataFrame with era col, target col and preds
        neutralized_dataf = pd.DataFrame(columns=[self.era_col, target_col, pred_col])
        neutralized_dataf[self.era_col] = dataf[self.era_col]
        neutralized_dataf[target_col] = dataf[target_col]
        neutralized_dataf[pred_col] = neutralized_preds

        neutral_corrs = self.per_era_numerai_corrs(
            dataf=neutralized_dataf,
            pred_col=pred_col,
            target_col=target_col,
        )
        mean, std, sharpe = self.mean_std_sharpe(era_corrs=neutral_corrs)
        return mean, std, sharpe

    def tbx_mean_std_sharpe(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str, tb: int = 200
    ) -> Tuple[np.float64, np.float64, np.float64]:
        """
        Calculate Mean, Standard deviation and Sharpe ratio
        when we focus on the x top and x bottom predictions.
        :param tb: How many of top and bottom predictions to focus on.
        TB200 and TB500 are the most common situations.
        """
        tb_val_corrs = self._score_by_date(
            dataf=dataf, columns=[pred_col], target=target_col, tb=tb
        )
        return self.mean_std_sharpe(era_corrs=tb_val_corrs)

    def exposure_dissimilarity(
        self, dataf: pd.DataFrame, pred_col: str, other_col: str, corr_method: str = "pearson"
    ) -> np.float32:
        """
        Model pattern of feature exposure relative to another column.
        See TC details forum post: https://forum.numer.ai/t/true-contribution-details/5128/4
        :param dataf: DataFrame containing both pred_col and other_col.
        :param pred_col: Main Prediction.
        :param other_col: Other prediction column to calculate exposure dissimilarity against.
        :param corr_method: Correlation method to use for calculating feature exposures.
        corr_method should be one of ['pearson', 'kendall', 'spearman']. Default: 'pearson'.
        """
        assert corr_method in ["pearson", "kendall", "spearman"], f"corr_method should be one of ['pearson', 'kendall', 'spearman']. Got: '{corr_method}'"
        feature_cols = [col for col in dataf.columns if col.startswith("feature")]
        U = dataf[feature_cols].corrwith(dataf[pred_col], method=corr_method).values
        E = dataf[feature_cols].corrwith(dataf[other_col], method=corr_method).values

        denominator = np.dot(E, E)
        if denominator == 0:
            exp_dis = 0
        else:
            exp_dis = 1 - np.dot(U, E) / denominator
        return exp_dis

    @staticmethod
    def _neutralize_series(
        series: pd.Series, by: pd.Series, proportion=1.0
    ) -> pd.Series:
        scores = series.values.reshape(-1, 1)
        exposures = by.values.reshape(-1, 1)

        # This line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures
        exposures = np.hstack(
            (exposures, np.array([np.nanmean(series)] * len(exposures)).reshape(-1, 1))
        )

        correction = proportion * (
            exposures.dot(np.linalg.lstsq(exposures, scores, rcond=None)[0])
        )
        corrected_scores = scores - correction
        neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
        return neutralized

    @staticmethod
    def _orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
        """Orthogonalizes v with respect to u by projecting v onto u,
        then subtracting that projection from v.

        This will reach the same result as the neutralize function when v and u
        are single column vectors, but this is much faster.

        Arguments:
            v: np.ndarray - the vector to orthogonalize
            u: np.ndarray - the vector to orthogonalize v against

        Returns:
            np.ndarray - the orthogonalized vector v
        """
        # Calculate the dot product of u and v
        dot_product = u.T @ v

        # Calculate the projection of v onto u
        projection = (dot_product / (u.T @ u)) * u

        # Subtract the projection from v
        return v - projection

    def _score_by_date(
        self, dataf: pd.DataFrame, columns: list, target: str, tb: int = None
    ):
        """
        Get era correlation based on given TB (x top and bottom predictions).
        :param tb: How many of top and bottom predictions to focus on.
        TB200 is the most common situation.
        """
        unique_eras = dataf[self.era_col].unique()
        computed = []
        for u in unique_eras:
            df_era = dataf[dataf[self.era_col] == u]
            era_pred = np.float64(df_era[columns].values.T)
            era_target = np.float64(df_era[target].values.T)

            if tb is None:
                ccs = np.corrcoef(era_target, era_pred)[0, 1:]
            else:
                tbidx = np.argsort(era_pred, axis=1)
                tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)
                ccs = [
                    np.corrcoef(era_target[idx], pred[idx])[0, 1]
                    for idx, pred in zip(tbidx, era_pred)
                ]
                ccs = np.array(ccs)
            computed.append(ccs)
        return pd.DataFrame(
            np.array(computed), columns=columns, index=dataf[self.era_col].unique()
        )

    @staticmethod
    def _normalize_uniform(df: pd.DataFrame, method: str = "first") -> pd.Series:
        """
        Normalize predictions uniformly using ranks.
        NOTE: Make sure the range of predictions is [0, 1] (inclusive).
        """
        x = (df.rank(method=method) - 0.5) / len(
            df
        )  # TODO: Evaluate if subtracting df.mean() is better
        return pd.Series(x, index=df.index)

    def get_feature_exposures_pearson(
        self,
        dataf: pd.DataFrame,
        pred_col: str,
        feature_list: List[str],
        cpu_cores: int = -1,
    ) -> pd.DataFrame:
        """
        Calculate feature exposures for each era using Pearson correlation.

        :param dataf: DataFrame containing predictions, features, and eras.
        :param pred_col: Prediction column to calculate feature exposures for.
        :param feature_list: List of feature columns in X.
        :param cpu_cores: Number of CPU cores to use for parallelization.
        :return: DataFrame with Pearson feature exposures by era for each feature.
        """

        def calculate_era_pearson_exposure(
            era, group, feature_list, pred_col_normalized
        ):
            data_matrix = group[feature_list + [pred_col_normalized]].values
            correlations = np.corrcoef(data_matrix, rowvar=False)

            # Get the correlations of all features with the predictions (which is the last column)
            feature_correlations = correlations[:-1, -1]
            return era, feature_correlations

        normalized_ranks = (dataf[[pred_col]].rank(method="first") - 0.5) / len(dataf)
        dataf[f"{pred_col}_normalized"] = stats.norm.ppf(normalized_ranks)
        feature_exposure_data = pd.DataFrame(
            index=dataf["era"].unique(), columns=feature_list
        )

        grouped_data = list(dataf.groupby("era"))

        results = Parallel(n_jobs=cpu_cores)(
            delayed(calculate_era_pearson_exposure)(
                era, group, feature_list, f"{pred_col}_normalized"
            )
            for era, group in grouped_data
        )

        for era, feature_correlations in results:
            feature_exposure_data.loc[era, :] = feature_correlations
        return feature_exposure_data

    def get_feature_exposures_corrv2(
        self,
        dataf: pd.DataFrame,
        pred_col: str,
        feature_list: List[str],
        cpu_cores: int = -1,
    ) -> pd.DataFrame:
        """
        Calculate feature exposures for each era using 'Numerai Corr'.
        Results will be similar to get_feature_exposures() but more accurate.
        This method will take longer to compute.

        :param dataf: DataFrame containing predictions, features, and eras.
        :param pred_col: Prediction column to calculate feature exposures for.
        :param feature_list: List of feature columns in X.
        :param cpu_cores: Number of CPU cores to use for parallelization.
        Default: -1 (all cores).
        :return: DataFrame with Corrv2 feature exposures by era for each feature.
        """

        def calculate_era_feature_exposure(era, group, pred_col, feature_list):
            exposures = {}
            for feature in feature_list:
                corr = self.numerai_corr(
                    group, pred_col=f"{pred_col}_normalized", target_col=feature
                )
                exposures[feature] = corr
            return era, exposures

        normalized_ranks = (dataf[[pred_col]].rank(method="first") - 0.5) / len(dataf)
        dataf[f"{pred_col}_normalized"] = stats.norm.ppf(normalized_ranks)
        feature_exposure_data = pd.DataFrame(
            index=dataf["era"].unique(), columns=feature_list
        )

        grouped_data = list(dataf.groupby("era"))

        results = Parallel(n_jobs=cpu_cores)(
            delayed(calculate_era_feature_exposure)(era, group, pred_col, feature_list)
            for era, group in grouped_data
        )
        for era, exposures in results:
            feature_exposure_data.loc[era, :] = exposures
        return feature_exposure_data

    def smart_sharpe(self, era_corrs: pd.Series) -> np.float64:
        """
        Sharpe adjusted for autocorrelation.
        :param era_corrs: Correlation scores by era
        """
        return np.nanmean(era_corrs) / (
            np.nanstd(era_corrs, ddof=1) * self.autocorr_penalty(era_corrs)
        )

    def autocorr_penalty(self, era_corrs: pd.Series) -> np.float64:
        """
        Adjusting factor for autocorrelation. Used in Smart Sharpe.
        :param era_corrs: Correlation scores by era.
        """
        n = len(era_corrs)
        # 1st order autocorrelation
        p = self.autocorr1(era_corrs)
        return np.sqrt(1 + 2 * np.sum([((n - i) / n) * p**i for i in range(1, n)]))

    def autocorr1(self, era_corrs: pd.Series) -> np.float64:
        """
        1st order autocorrelation.
        :param era_corrs: Correlation scores by era.
        """
        return np.corrcoef(era_corrs[:-1], era_corrs[1:])[0, 1]

    def legacy_contribution(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str, other_col: str
    ):
        """
        Legacy contribution mean, standard deviation and Sharpe ratio.
        More info: https://forum.numer.ai/t/mmc2-announcement/93

        :param dataf: DataFrame containing era_col, pred_col, target_col and other_col.
        :param pred_col: Prediction column to calculate MMC for.
        :param target_col: Target column to calculate MMC against.
        :param other_col: Meta model column containing predictions to neutralize against.

        :return: List of legacy contribution scores by era.
        """
        legacy_mc_scores = []
        # Standard deviation of a uniform distribution
        COVARIANCE_FACTOR = 0.29**2
        # Calculate MMC for each era
        for _, x in dataf.groupby(self.era_col):
            series = self._neutralize_series(
                self._normalize_uniform(x[pred_col]), (x[other_col])
            )
            legacy_mc_scores.append(
                np.cov(series, x[target_col])[0, 1] / COVARIANCE_FACTOR
            )

        return legacy_mc_scores

    def contributive_correlation(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str, other_col: str
    ) -> np.array:
        """Calculate the contributive correlation of the given predictions
        wrt the given meta model.
        see: https://docs.numer.ai/numerai-tournament/scoring/meta-model-contribution-mmc-and-bmc

        Uses Numerai's official scoring function for contribution under the hood.
        See: https://github.com/numerai/numerai-tools/blob/master/numerai_tools/scoring.py

        Calculate contributive correlation by:
        1. tie-kept ranking each prediction and the meta model
        2. gaussianizing each prediction and the meta model
        3. orthogonalizing each prediction wrt the meta model
        3.5. scaling the targets to buckets [-2, -1, 0, 1, 2]
        4. dot product the orthogonalized predictions and the targets,
           then normalize by the length of the target (equivalent to covariance)

        :param dataf: DataFrame containing era_col, pred_col, target_col and other_col.
        :param pred_col: Prediction column to calculate MMC for.
        :param target_col: Target column to calculate MMC against.
        Make sure the range of targets is [0, 1] (inclusive). 
        If the function is called from full_evaluation, this is guaranteed because of the checks.
        :param other_col: Meta model column containing predictions to neutralize against.

        :return: A 1D NumPy array of contributive correlations by era.
        """
        mc_scores = []
        for _, x in dataf.groupby(self.era_col):
            mc = correlation_contribution(x[[pred_col]], 
                                          x[other_col], 
                                          x[target_col])
            mc_scores.append(mc)
        return np.array(mc_scores).ravel()

    def check_custom_functions(self):
        if not isinstance(self.custom_functions, dict):
            raise ValueError("custom_functions must be a dictionary")

        for func_name, func_info in self.custom_functions.items():
            if not isinstance(func_info, dict) or 'func' not in func_info or 'args' not in func_info:
                raise ValueError(f"Function {func_name} must have a 'func' and 'args' key")

            if not callable(func_info['func']):
                raise ValueError(f"The 'func' value for '{func_name}' in custom_functions must be a callable function.")

            if not isinstance(func_info['args'], dict):
                raise ValueError(f"'args' for '{func_name}' in custom_functions must be a dictionary")

            if "local_args" in func_info:
                if not isinstance(func_info['local_args'], list):
                    raise ValueError(f"The 'local_args' key for {func_name} in custom_functionsmust be a list")
                for local_arg in func_info['local_args']:
                    if not isinstance(local_arg, str):
                        raise ValueError(f"Local arg '{local_arg}' for '{func_name}' in custom_functions must be string.")
                    if local_arg not in list(func_info['args'].keys()):
                        raise ValueError(f"Local arg '{local_arg}' for '{func_name}' in custom_functions was not found in 'args'")

    def plot_correlations(
        self,
        dataf: pd.DataFrame,
        pred_cols: List[str],
        corr_cols: list = None,
        target_col: str = "target",
        roll_mean: int = 20,
    ):
        """
        Plot per era correlations over time.
        :param dataf: DataFrame that contains at least all pred_cols, target_col and corr_cols.
        :param pred_cols: List of prediction columns to calculate per era correlations for and plot.
        :param corr_cols: Per era correlations already prepared to include in the plot.
        This is optional for if you already have per era correlations prepared in your input dataf.
        :param target_col: Target column name to compute per era correlations against.
        :param roll_mean: How many eras should be averaged to compute a rolling score.
        """
        validation_by_eras = pd.DataFrame()
        # Compute per era correlation for each prediction column.
        for pred_col in pred_cols:
            per_era_corrs = self.per_era_numerai_corrs(
                dataf, pred_col=pred_col, target_col=target_col
            )
            validation_by_eras.loc[:, pred_col] = per_era_corrs

        # Add prepared per era correlation if any.
        if corr_cols is not None:
            for corr_col in corr_cols:
                validation_by_eras.loc[:, corr_col] = dataf[corr_col]

        validation_by_eras.rolling(roll_mean).mean().plot(
            kind="line",
            marker="o",
            ms=4,
            title=f"Rolling Per Era Correlation Mean (rolling window size: {roll_mean})",
            figsize=(15, 5),
        )
        plt.legend(
            loc="upper center",
            bbox_to_anchor=(0.5, -0.05),
            fancybox=True,
            shadow=True,
            ncol=1,
        )
        plt.axhline(y=0.0, color="r", linestyle="--")
        plt.show()

        validation_by_eras.cumsum().plot(
            title="Cumulative Sum of Era Correlations", figsize=(15, 5)
        )
        plt.legend(
            loc="upper center",
            bbox_to_anchor=(0.5, -0.05),
            fancybox=True,
            shadow=True,
            ncol=1,
        )
        plt.axhline(y=0.0, color="r", linestyle="--")
        plt.show()
        return

    @staticmethod
    def plot_correlation_heatmap(dataf: pd.DataFrame, pred_cols: List[str]):
        corr_matrix = dataf[pred_cols].corr().to_numpy()

        plt.figure(figsize=(20, 20))

        # Create heatmap
        plt.imshow(corr_matrix, cmap="coolwarm", interpolation="none")
        plt.colorbar()

        # Add ticks and labels
        ticks = np.arange(0, len(pred_cols), 1)
        plt.xticks(ticks, pred_cols, rotation=90, fontsize=8)
        plt.yticks(ticks, pred_cols, fontsize=8)

        plt.show()
        return

apy(era_corrs, stake_compounding_lag=4) staticmethod

Annual percentage yield. :param era_corrs: Correlation scores by era :param stake_compounding_lag: Compounding lag for Numerai rounds (4 for Numerai Classic)

Source code in numerblox/evaluation.py
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
@staticmethod
def apy(era_corrs: pd.Series, stake_compounding_lag: int = 4) -> np.float64:
    """
    Annual percentage yield.
    :param era_corrs: Correlation scores by era
    :param stake_compounding_lag: Compounding lag for Numerai rounds (4 for Numerai Classic)
    """
    payout_scores = era_corrs.clip(-0.25, 0.25)
    payout_product = (payout_scores + 1).prod()
    return (
        payout_product
        ** (
            # 52 weeks of compounding minus n for stake compounding lag
            (52 - stake_compounding_lag)
            / len(payout_scores)
        )
        - 1
    ) * 100
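
As a sanity check of the compounding above, the same formula can be reproduced standalone on a toy series of era correlations (the numbers below are made up purely for illustration):

import pandas as pd

# Toy per-era correlation scores, purely illustrative.
era_corrs = pd.Series([0.02, -0.01, 0.03, 0.015, 0.005])

# Clip payouts, compound them, then annualize over 52 - 4 rounds,
# mirroring the apy() implementation above.
payout_scores = era_corrs.clip(-0.25, 0.25)
apy_pct = ((payout_scores + 1).prod() ** ((52 - 4) / len(payout_scores)) - 1) * 100
print(round(apy_pct, 2))  # annual percentage yield in percent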

autocorr1(era_corrs)

1st order autocorrelation. :param era_corrs: Correlation scores by era.

Source code in numerblox/evaluation.py
768
769
770
771
772
773
def autocorr1(self, era_corrs: pd.Series) -> np.float64:
    """
    1st order autocorrelation.
    :param era_corrs: Correlation scores by era.
    """
    return np.corrcoef(era_corrs[:-1], era_corrs[1:])[0, 1]

autocorr_penalty(era_corrs)

Adjusting factor for autocorrelation. Used in Smart Sharpe. :param era_corrs: Correlation scores by era.

Source code in numerblox/evaluation.py
758
759
760
761
762
763
764
765
766
def autocorr_penalty(self, era_corrs: pd.Series) -> np.float64:
    """
    Adjusting factor for autocorrelation. Used in Smart Sharpe.
    :param era_corrs: Correlation scores by era.
    """
    n = len(era_corrs)
    # 1st order autocorrelation
    p = self.autocorr1(era_corrs)
    return np.sqrt(1 + 2 * np.sum([((n - i) / n) * p**i for i in range(1, n)]))

contributive_correlation(dataf, pred_col, target_col, other_col)

Calculate the contributive correlation of the given predictions wrt the given meta model. see: https://docs.numer.ai/numerai-tournament/scoring/meta-model-contribution-mmc-and-bmc

Uses Numerai's official scoring function for contribution under the hood. See: https://github.com/numerai/numerai-tools/blob/master/numerai_tools/scoring.py

Calculate contributive correlation by: 1. tie-kept ranking each prediction and the meta model 2. gaussianizing each prediction and the meta model 3. orthogonalizing each prediction wrt the meta model 3.5. scaling the targets to buckets [-2, -1, 0, 1, 2] 4. dot product the orthogonalized predictions and the targets then normalize by the length of the target (equivalent to covariance)

:param dataf: DataFrame containing era_col, pred_col, target_col and other_col. :param pred_col: Prediction column to calculate MMC for. :param target_col: Target column to calculate MMC against. Make sure the range of targets is [0, 1] (inclusive). If the function is called from full_evaluation, this is guaranteed because of the checks. :param other_col: Meta model column containing predictions to neutralize against.

:return: A 1D NumPy array of contributive correlations by era.

Source code in numerblox/evaluation.py
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
def contributive_correlation(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str, other_col: str
) -> np.array:
    """Calculate the contributive correlation of the given predictions
    wrt the given meta model.
    see: https://docs.numer.ai/numerai-tournament/scoring/meta-model-contribution-mmc-and-bmc

    Uses Numerai's official scoring function for contribution under the hood.
    See: https://github.com/numerai/numerai-tools/blob/master/numerai_tools/scoring.py

    Calculate contributive correlation by:
    1. tie-kept ranking each prediction and the meta model
    2. gaussianizing each prediction and the meta model
    3. orthogonalizing each prediction wrt the meta model
    3.5. scaling the targets to buckets [-2, -1, 0, 1, 2]
    4. dot product the orthogonalized predictions and the targets,
       then normalize by the length of the target (equivalent to covariance)

    :param dataf: DataFrame containing era_col, pred_col, target_col and other_col.
    :param pred_col: Prediction column to calculate MMC for.
    :param target_col: Target column to calculate MMC against.
    Make sure the range of targets is [0, 1] (inclusive). 
    If the function is called from full_evaluation, this is guaranteed because of the checks.
    :param other_col: Meta model column containing predictions to neutralize against.

    :return: A 1D NumPy array of contributive correlations by era.
    """
    mc_scores = []
    for _, x in dataf.groupby(self.era_col):
        mc = correlation_contribution(x[[pred_col]], 
                                      x[other_col], 
                                      x[target_col])
        mc_scores.append(mc)
    return np.array(mc_scores).ravel()
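
A minimal usage sketch on a synthetic DataFrame (column names and data are made up for illustration; the method delegates to correlation_contribution from the numerai-tools package, so that dependency needs to be installed):

import numpy as np
import pandas as pd
from numerblox.evaluation import NumeraiClassicEvaluator

rng = np.random.default_rng(0)
n = 200
df = pd.DataFrame({
    "era": ["0001"] * (n // 2) + ["0002"] * (n // 2),
    "prediction": rng.uniform(size=n),
    "meta_model": rng.uniform(size=n),
    # Targets must stay in the [0, 1] range, as noted in the docstring above.
    "target": rng.choice([0.0, 0.25, 0.5, 0.75, 1.0], size=n),
})

evaluator = NumeraiClassicEvaluator(era_col="era")
mc_by_era = evaluator.contributive_correlation(
    dataf=df, pred_col="prediction", target_col="target", other_col="meta_model"
)
print(mc_by_era)  # one contribution score per era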

cross_correlation(dataf, pred_col, other_col)

Corrv2 correlation with other predictions (like another model, example predictions or meta model prediction). :param dataf: DataFrame containing both pred_col and other_col. :param pred_col: Main Prediction. :param other_col: Other prediction column to calculate correlation with pred_col.

:return: Correlation between Corrv2's of pred_col and other_col.

Source code in numerblox/evaluation.py
485
486
487
488
489
490
491
492
493
494
495
496
497
498
def cross_correlation(self, dataf: pd.DataFrame, pred_col: str, other_col: str):
    """
    Corrv2 correlation with other predictions (like another model, example predictions or meta model prediction).
    :param dataf: DataFrame containing both pred_col and other_col.
    :param pred_col: Main Prediction.
    :param other_col: Other prediction column to calculate correlation with pred_col.

    :return: Correlation between Corrv2's of pred_col and other_col.
    """
    return self.per_era_numerai_corrs(
        dataf=dataf,
        pred_col=pred_col,
        target_col=other_col,
    ).mean()

evaluation_one_col(dataf, feature_cols, pred_col, target_col, benchmark_cols=None)

Perform evaluation for one prediction column against given target and benchmark column(s).

Source code in numerblox/evaluation.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
def evaluation_one_col(
    self,
    dataf: pd.DataFrame,
    feature_cols: list,
    pred_col: str,
    target_col: str,
    benchmark_cols: list = None,
):
    """
    Perform evaluation for one prediction column
    against given target and benchmark column(s).
    """
    assert (
        self.era_col in dataf.columns
    ), f"Era column '{self.era_col}' not found in DataFrame. Make sure to set the correct era_col."
    assert (
            pred_col in dataf.columns
        ), f"Prediction column '{pred_col}' not found in DataFrame. Make sure to set the correct pred_col."
    assert (
        target_col in dataf.columns
    ), f"Target column '{target_col}' not found in DataFrame. Make sure to set the correct target_col."
    if benchmark_cols:
        for col in benchmark_cols:
            assert (
                col in dataf.columns
            ), f"Benchmark column '{col}' not found in DataFrame. Make sure to set the correct benchmark_cols."

    # Check that all values are between 0 and 1
    assert (
        dataf[pred_col].min() >= 0 and dataf[pred_col].max() <= 1
    ), "All predictions should be between 0 and 1 (inclusive)."
    assert (
        dataf[target_col].min() >= 0 and dataf[target_col].max() <= 1
    ), "All targets should be between 0 and 1 (inclusive)."
    if benchmark_cols is not None:
        for col in benchmark_cols:
            assert (
                dataf[col].min() >= 0 and dataf[col].max() <= 1
            ), f"All predictions for '{col}' should be between 0 and 1 (inclusive)."

    if self.show_detailed_progress_bar:
        len_metrics_list = len(self.metrics_list)
        len_benchmark_cols = 0 if benchmark_cols is None else len(benchmark_cols)
        len_custom_functions = 0 if self.custom_functions is None else len(list(self.custom_functions.keys()))
        len_pbar = len_metrics_list + len_benchmark_cols + len_custom_functions
        pbar = tqdm(total=len_pbar, desc="Evaluation")

    col_stats = {}
    col_stats["target"] = target_col

    # Compute stats per era (only if needed)
    per_era_numerai_corrs = self.per_era_numerai_corrs(
        dataf=dataf, pred_col=pred_col, target_col=target_col
    )

    # check if mean, std, or sharpe are in metrics_list
    if "mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"mean_std_sharpe for evaluation")
            pbar.update(1)
        mean, std, sharpe = self.mean_std_sharpe(era_corrs=per_era_numerai_corrs)
        col_stats["mean"] = mean
        col_stats["std"] = std
        col_stats["sharpe"] = sharpe

    if "legacy_mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"legacy_mean_std_sharpe for evaluation")
            pbar.update(1)
        per_era_corrs = self.per_era_corrs(
            dataf=dataf, pred_col=pred_col, target_col=target_col
        )
        legacy_mean, legacy_std, legacy_sharpe = self.mean_std_sharpe(
            era_corrs=per_era_corrs
        )
        col_stats["legacy_mean"] = legacy_mean
        col_stats["legacy_std"] = legacy_std
        col_stats["legacy_sharpe"] = legacy_sharpe

    if "max_drawdown" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"max_drawdown for evaluation")
            pbar.update(1)
        col_stats["max_drawdown"] = self.max_drawdown(
            era_corrs=per_era_numerai_corrs
        )

    if "apy":
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"apy for evaluation")
            pbar.update(1)
        col_stats["apy"] = self.apy(era_corrs=per_era_numerai_corrs)

    if "calmar_ratio" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"calmar_ratio for evaluation")
            pbar.update(1)
        if not "max_drawdown" in self.metrics_list:
            col_stats["max_drawdown"] = self.max_drawdown(
                era_corrs=per_era_numerai_corrs
            )
        if not "apy" in self.metrics_list:
            col_stats["apy"] = self.apy(era_corrs=per_era_numerai_corrs)
        col_stats["calmar_ratio"] = (
            np.nan
            if col_stats["max_drawdown"] == 0
            else col_stats["apy"] / -col_stats["max_drawdown"]
        )

    if "autocorrelation" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description(f"autocorrelation for evaluation")
            pbar.update(1)
        col_stats["autocorrelation"] = self.autocorr1(per_era_numerai_corrs)

    if "max_feature_exposure" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"max_feature_exposure for evaluation")
            pbar.update(1)
        col_stats["max_feature_exposure"] = self.max_feature_exposure(
            dataf=dataf, feature_cols=feature_cols, pred_col=pred_col
        )

    if "smart_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"smart_sharpe for evaluation")
            pbar.update(1)
        col_stats["smart_sharpe"] = self.smart_sharpe(
            era_corrs=per_era_numerai_corrs
        )

    if benchmark_cols is not None:
        for bench_col in benchmark_cols:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"Evaluation for benchmark column: '{bench_col}'")
                pbar.update(1)

            per_era_bench_corrs = self.per_era_numerai_corrs(
                dataf=dataf, pred_col=bench_col, target_col=target_col
            )

            if "mean_std_sharpe" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"mean_std_sharpe for benchmark column: '{bench_col}'")
                bench_mean, bench_std, bench_sharpe = self.mean_std_sharpe(
                    era_corrs=per_era_bench_corrs
                )
                col_stats[f"mean_vs_{bench_col}"] = mean - bench_mean
                col_stats[f"std_vs_{bench_col}"] = std - bench_std
                col_stats[f"sharpe_vs_{bench_col}"] = sharpe - bench_sharpe

            if "mc_mean_std_sharpe" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"mc_mean_std_sharpe for benchmark column: '{bench_col}'")
                mc_scores = self.contributive_correlation(
                    dataf=dataf,
                    pred_col=pred_col,
                    target_col=target_col,
                    other_col=bench_col,
                )
                col_stats[f"mc_mean_{bench_col}"] = np.nanmean(mc_scores)
                col_stats[f"mc_std_{bench_col}"] = np.nanstd(mc_scores)
                col_stats[f"mc_sharpe_{bench_col}"] = (
                    np.nan
                    if col_stats[f"mc_std_{bench_col}"] == 0
                    else col_stats[f"mc_mean_{bench_col}"]
                    / col_stats[f"mc_std_{bench_col}"]
                )

            if "corr_with" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"corr_with for benchmark column: '{bench_col}'")
                col_stats[f"corr_with_{bench_col}"] = self.cross_correlation(
                    dataf=dataf, pred_col=pred_col, other_col=bench_col
                )

            if "legacy_mc_mean_std_sharpe" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"legacy_mc_mean_std_sharpe for benchmark column: '{bench_col}'")
                legacy_mc_scores = self.legacy_contribution(
                    dataf=dataf,
                    pred_col=pred_col,
                    target_col=target_col,
                    other_col=bench_col,
                )
                col_stats[f"legacy_mc_mean_{bench_col}"] = np.nanmean(
                    legacy_mc_scores
                )
                col_stats[f"legacy_mc_std_{bench_col}"] = np.nanstd(
                    legacy_mc_scores
                )
                col_stats[f"legacy_mc_sharpe_{bench_col}"] = (
                    np.nan
                    if col_stats[f"legacy_mc_std_{bench_col}"] == 0
                    else col_stats[f"legacy_mc_mean_{bench_col}"]
                    / col_stats[f"legacy_mc_std_{bench_col}"]
                )

            if "ex_diss" in self.metrics_list or "ex_diss_pearson" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"ex_diss_pearson for benchmark column: '{bench_col}'")
                col_stats[
                    f"exposure_dissimilarity_pearson_{bench_col}"
                ] = self.exposure_dissimilarity(
                    dataf=dataf, pred_col=pred_col, other_col=bench_col,
                    corr_method="pearson"
                )
            if "ex_diss_spearman" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"ex_diss_spearman for benchmark column: '{bench_col}'")
                col_stats[
                    f"exposure_dissimilarity_spearman_{bench_col}"
                ] = self.exposure_dissimilarity(
                    dataf=dataf, pred_col=pred_col, other_col=bench_col,
                    corr_method="spearman"
                )

    # Compute intensive stats
    if "fn_mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"fn_mean_std_sharpe for evaluation")
            pbar.update(1)
        fn_mean, fn_std, fn_sharpe = self.feature_neutral_mean_std_sharpe(
            dataf=dataf,
            pred_col=pred_col,
            target_col=target_col,
            feature_names=feature_cols,
        )
        col_stats["feature_neutral_mean"] = fn_mean
        col_stats["feature_neutral_std"] = fn_std
        col_stats["feature_neutral_sharpe"] = fn_sharpe

    if "tb200_mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"tb200_mean_std_sharpe for evaluation")
            pbar.update(1)
        tb200_mean, tb200_std, tb200_sharpe = self.tbx_mean_std_sharpe(
            dataf=dataf, pred_col=pred_col, target_col=target_col, tb=200
        )
        col_stats["tb200_mean"] = tb200_mean
        col_stats["tb200_std"] = tb200_std
        col_stats["tb200_sharpe"] = tb200_sharpe

    if "tb500_mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"tb500_mean_std_sharpe for evaluation")
            pbar.update(1)
        tb500_mean, tb500_std, tb500_sharpe = self.tbx_mean_std_sharpe(
            dataf=dataf, pred_col=pred_col, target_col=target_col, tb=500
        )
        col_stats["tb500_mean"] = tb500_mean
        col_stats["tb500_std"] = tb500_std
        col_stats["tb500_sharpe"] = tb500_sharpe

    # Custom functions
    if self.custom_functions is not None:
        local_vars = locals()
        for func_name, func_info in self.custom_functions.items():
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"custom function: '{func_name}' for evaluation")
                pbar.update(1)
            func = func_info['func']
            args = func_info['args']
            local_args = func_info.get('local_args', [])
            resolved_args = {}
            for k, v in args.items():
                # Resolve variables defined as local args
                if isinstance(v, str) and v in local_args:
                    if v not in local_vars:
                        raise ValueError(f"Variable '{v}' was defined in 'local_args', but was not found in local variables. Make sure to set the correct local_args.")
                    else:
                        resolved_args[k] = local_vars[v]
                else:
                    resolved_args[k] = v
            col_stats[func_name] = func(**resolved_args)

    col_stats_df = pd.DataFrame(col_stats, index=[pred_col])
    if self.show_detailed_progress_bar:
        pbar.update(1)
        pbar.close()
    return col_stats_df
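
The custom_functions resolution at the end of this method expects the structure enforced by check_custom_functions: each entry has a 'func' and an 'args' dict, and any argument whose string value names a local variable of evaluation_one_col (such as 'dataf', 'pred_col' or 'target_col') must also be listed under 'local_args'. A hypothetical sketch (the metric itself, which groups on a literal "era" column, is made up for illustration):

import pandas as pd

# Hypothetical custom metric: spread between the best and worst per-era correlation.
def corr_spread(dataf: pd.DataFrame, pred_col: str, target_col: str) -> float:
    per_era = dataf.groupby("era").apply(lambda d: d[pred_col].corr(d[target_col]))
    return float(per_era.max() - per_era.min())

custom_functions = {
    "corr_spread": {
        "func": corr_spread,
        # Values listed in 'local_args' are resolved from evaluation_one_col's
        # local variables at runtime instead of being passed through literally.
        "args": {"dataf": "dataf", "pred_col": "pred_col", "target_col": "target_col"},
        "local_args": ["dataf", "pred_col", "target_col"],
    }
}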

exposure_dissimilarity(dataf, pred_col, other_col, corr_method='pearson')

Model pattern of feature exposure to another column. See TC details forum post: https://forum.numer.ai/t/true-contribution-details/5128/4 :param dataf: DataFrame containing both pred_col and other_col. :param pred_col: Main Prediction. :param other_col: Other prediction column to calculate exposure dissimilarity against. :param corr_method: Correlation method to use for calculating feature exposures. corr_method should be one of ['pearson', 'kendall', 'spearman']. Default: 'pearson'.

Source code in numerblox/evaluation.py
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
def exposure_dissimilarity(
    self, dataf: pd.DataFrame, pred_col: str, other_col: str, corr_method: str = "pearson"
) -> np.float32:
    """
    Model pattern of feature exposure to another column.
    See TC details forum post: https://forum.numer.ai/t/true-contribution-details/5128/4
    :param dataf: DataFrame containing both pred_col and other_col.
    :param pred_col: Main Prediction.
    :param other_col: Other prediction column to calculate exposure dissimilarity against.
    :param corr_method: Correlation method to use for calculating feature exposures.
    corr_method should be one of ['pearson', 'kendall', 'spearman']. Default: 'pearson'.
    """
    assert corr_method in ["pearson", "kendall", "spearman"], f"corr_method should be one of ['pearson', 'kendall', 'spearman']. Got: '{corr_method}'"
    feature_cols = [col for col in dataf.columns if col.startswith("feature")]
    U = dataf[feature_cols].corrwith(dataf[pred_col], method=corr_method).values
    E = dataf[feature_cols].corrwith(dataf[other_col], method=corr_method).values

    denominator = np.dot(E, E)
    if denominator == 0:
        exp_dis = 0
    else:
        exp_dis = 1 - np.dot(U, E) / denominator
    return exp_dis
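
The metric boils down to 1 - (U·E)/(E·E), where U and E are the feature-exposure vectors of pred_col and other_col. A standalone sketch with made-up exposure vectors:

import numpy as np

# Made-up exposures of two prediction columns to the same three features.
U = np.array([0.10, -0.05, 0.02])  # exposures of pred_col
E = np.array([0.08, -0.04, 0.01])  # exposures of other_col

denominator = np.dot(E, E)
exp_dis = 0 if denominator == 0 else 1 - np.dot(U, E) / denominator
print(exp_dis)  # near 0 for similar exposure patterns, larger when they diverge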

feature_neutral_mean_std_sharpe(dataf, pred_col, target_col, feature_names)

Feature neutralized mean performance. More info: https://docs.numer.ai/tournament/feature-neutral-correlation

Source code in numerblox/evaluation.py
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
def feature_neutral_mean_std_sharpe(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str, feature_names: list
) -> Tuple[np.float64, np.float64, np.float64]:
    """
    Feature neutralized mean performance.
    More info: https://docs.numer.ai/tournament/feature-neutral-correlation
    """
    fn = FeatureNeutralizer(pred_name=pred_col, proportion=1.0)
    fn.set_predict_request(features=True, era_series=True)
    neutralized_preds = fn.predict(
        dataf[pred_col], features=dataf[feature_names], era_series=dataf[self.era_col]
    )
    # Construct new DataFrame with era col, target col and preds
    neutralized_dataf = pd.DataFrame(columns=[self.era_col, target_col, pred_col])
    neutralized_dataf[self.era_col] = dataf[self.era_col]
    neutralized_dataf[target_col] = dataf[target_col]
    neutralized_dataf[pred_col] = neutralized_preds

    neutral_corrs = self.per_era_numerai_corrs(
        dataf=neutralized_dataf,
        pred_col=pred_col,
        target_col=target_col,
    )
    mean, std, sharpe = self.mean_std_sharpe(era_corrs=neutral_corrs)
    return mean, std, sharpe

full_evaluation(dataf, pred_cols, target_col='target', benchmark_cols=None)

Perform evaluation for each prediction column in pred_cols. By default only the "prediction" column is evaluated. Evaluation is done against given target and benchmark prediction column. :param dataf: DataFrame containing era_col, pred_cols, target_col and optional benchmark_cols. :param pred_cols: List of prediction columns to calculate evaluation metrics for. :param target_col: Target column to evaluate against. :param benchmark_cols: Optional list of benchmark columns to calculate evaluation metrics for.

Source code in numerblox/evaluation.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def full_evaluation(
    self,
    dataf: pd.DataFrame,
    pred_cols: List[str],
    target_col: str = "target",
    benchmark_cols: list = None,
) -> pd.DataFrame:
    """
    Perform evaluation for each prediction column in pred_cols.
    By default only the "prediction" column is evaluated.
    Evaluation is done against given target and benchmark prediction column.
    :param dataf: DataFrame containing era_col, pred_cols, target_col and optional benchmark_cols.
    :param pred_cols: List of prediction columns to calculate evaluation metrics for.
    :param target_col: Target column to evaluate against.
    :param benchmark_cols: Optional list of benchmark columns to calculate evaluation metrics for.
    """
    val_stats = pd.DataFrame()
    feature_cols = [col for col in dataf.columns if col.startswith("feature")]
    cat_cols = (
        dataf[feature_cols].select_dtypes(include=["category"]).columns.to_list()
    )
    if cat_cols:
        print(
            f"WARNING: Categorical features detected that cannot be used for neutralization. Removing columns: '{cat_cols}' for evaluation."
        )
        dataf.loc[:, feature_cols] = dataf[feature_cols].select_dtypes(
            exclude=["category"]
        )
    dataf = dataf.fillna(0.5)
    for col in tqdm(pred_cols, desc="Evaluation: "):
        col_stats = self.evaluation_one_col(
            dataf=dataf,
            pred_col=col,
            feature_cols=feature_cols,
            target_col=target_col,
            benchmark_cols=benchmark_cols,
        )
        val_stats = pd.concat([val_stats, col_stats], axis=0)
    return val_stats
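
A minimal end-to-end sketch on synthetic data (column names and values are made up purely for illustration; metrics_list defaults to the FAST_METRICS preset):

import numpy as np
import pandas as pd
from numerblox.evaluation import NumeraiClassicEvaluator

rng = np.random.default_rng(42)
n = 300
df = pd.DataFrame({
    "era": np.repeat(["0001", "0002", "0003"], n // 3),
    # Columns starting with "feature" are picked up automatically as feature_cols.
    "feature_a": rng.uniform(size=n),
    "feature_b": rng.uniform(size=n),
    "prediction": rng.uniform(size=n),
    "target": rng.choice([0.0, 0.25, 0.5, 0.75, 1.0], size=n),
})

evaluator = NumeraiClassicEvaluator(era_col="era")
metrics = evaluator.full_evaluation(dataf=df, pred_cols=["prediction"], target_col="target")
print(metrics.T)  # one row of metrics per prediction column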

get_feature_exposures_corrv2(dataf, pred_col, feature_list, cpu_cores=-1)

Calculate feature exposures for each era using 'Numerai Corr'. Results will be similar to get_feature_exposures() but more accurate. This method will take longer to compute.

:param dataf: DataFrame containing predictions, features, and eras. :param pred_col: Prediction column to calculate feature exposures for. :param feature_list: List of feature columns in X. :param cpu_cores: Number of CPU cores to use for parallelization. Default: -1 (all cores). :return: DataFrame with Corrv2 feature exposures by era for each feature.

Source code in numerblox/evaluation.py
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
def get_feature_exposures_corrv2(
    self,
    dataf: pd.DataFrame,
    pred_col: str,
    feature_list: List[str],
    cpu_cores: int = -1,
) -> pd.DataFrame:
    """
    Calculate feature exposures for each era using 'Numerai Corr'.
    Results will be similar to get_feature_exposures() but more accurate.
    This method will take longer to compute.

    :param dataf: DataFrame containing predictions, features, and eras.
    :param pred_col: Prediction column to calculate feature exposures for.
    :param feature_list: List of feature columns in X.
    :param cpu_cores: Number of CPU cores to use for parallelization.
    Default: -1 (all cores).
    :return: DataFrame with Corrv2 feature exposures by era for each feature.
    """

    def calculate_era_feature_exposure(era, group, pred_col, feature_list):
        exposures = {}
        for feature in feature_list:
            corr = self.numerai_corr(
                group, pred_col=f"{pred_col}_normalized", target_col=feature
            )
            exposures[feature] = corr
        return era, exposures

    normalized_ranks = (dataf[[pred_col]].rank(method="first") - 0.5) / len(dataf)
    dataf[f"{pred_col}_normalized"] = stats.norm.ppf(normalized_ranks)
    feature_exposure_data = pd.DataFrame(
        index=dataf["era"].unique(), columns=feature_list
    )

    grouped_data = list(dataf.groupby("era"))

    results = Parallel(n_jobs=cpu_cores)(
        delayed(calculate_era_feature_exposure)(era, group, pred_col, feature_list)
        for era, group in grouped_data
    )
    for era, exposures in results:
        feature_exposure_data.loc[era, :] = exposures
    return feature_exposure_data

get_feature_exposures_pearson(dataf, pred_col, feature_list, cpu_cores=-1)

Calculate feature exposures for each era using Pearson correlation.

:param dataf: DataFrame containing predictions, features, and eras. :param pred_col: Prediction column to calculate feature exposures for. :param feature_list: List of feature columns in X. :param cpu_cores: Number of CPU cores to use for parallelization. :return: DataFrame with Pearson feature exposures by era for each feature.

Source code in numerblox/evaluation.py
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
def get_feature_exposures_pearson(
    self,
    dataf: pd.DataFrame,
    pred_col: str,
    feature_list: List[str],
    cpu_cores: int = -1,
) -> pd.DataFrame:
    """
    Calculate feature exposures for each era using Pearson correlation.

    :param dataf: DataFrame containing predictions, features, and eras.
    :param pred_col: Prediction column to calculate feature exposures for.
    :param feature_list: List of feature columns in X.
    :param cpu_cores: Number of CPU cores to use for parallelization.
    :return: DataFrame with Pearson feature exposures by era for each feature.
    """

    def calculate_era_pearson_exposure(
        era, group, feature_list, pred_col_normalized
    ):
        data_matrix = group[feature_list + [pred_col_normalized]].values
        correlations = np.corrcoef(data_matrix, rowvar=False)

        # Get the correlations of all features with the predictions (which is the last column)
        feature_correlations = correlations[:-1, -1]
        return era, feature_correlations

    normalized_ranks = (dataf[[pred_col]].rank(method="first") - 0.5) / len(dataf)
    dataf[f"{pred_col}_normalized"] = stats.norm.ppf(normalized_ranks)
    feature_exposure_data = pd.DataFrame(
        index=dataf["era"].unique(), columns=feature_list
    )

    grouped_data = list(dataf.groupby("era"))

    results = Parallel(n_jobs=cpu_cores)(
        delayed(calculate_era_pearson_exposure)(
            era, group, feature_list, f"{pred_col}_normalized"
        )
        for era, group in grouped_data
    )

    for era, feature_correlations in results:
        feature_exposure_data.loc[era, :] = feature_correlations
    return feature_exposure_data

legacy_contribution(dataf, pred_col, target_col, other_col)

Legacy contribution mean, standard deviation and Sharpe ratio. More info: https://forum.numer.ai/t/mmc2-announcement/93

:param dataf: DataFrame containing era_col, pred_col, target_col and other_col. :param pred_col: Prediction column to calculate MMC for. :param target_col: Target column to calculate MMC against. :param other_col: Meta model column containing predictions to neutralize against.

:return: List of legacy contribution scores by era.

Source code in numerblox/evaluation.py
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
def legacy_contribution(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str, other_col: str
):
    """
    Legacy contribution mean, standard deviation and Sharpe ratio.
    More info: https://forum.numer.ai/t/mmc2-announcement/93

    :param dataf: DataFrame containing era_col, pred_col, target_col and other_col.
    :param pred_col: Prediction column to calculate MMC for.
    :param target_col: Target column to calculate MMC against.
    :param other_col: Meta model column containing predictions to neutralize against.

    :return: List of legacy contribution scores by era.
    """
    legacy_mc_scores = []
    # Standard deviation of a uniform distribution
    COVARIANCE_FACTOR = 0.29**2
    # Calculate MMC for each era
    for _, x in dataf.groupby(self.era_col):
        series = self._neutralize_series(
            self._normalize_uniform(x[pred_col]), (x[other_col])
        )
        legacy_mc_scores.append(
            np.cov(series, x[target_col])[0, 1] / COVARIANCE_FACTOR
        )

    return legacy_mc_scores
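
The hard-coded COVARIANCE_FACTOR above is the square of 0.29, which approximates the standard deviation of a Uniform(0, 1) variable, 1/sqrt(12) ≈ 0.2887. A one-line check:

import numpy as np

print(1 / np.sqrt(12))  # ~0.2887, the standard deviation of a Uniform(0, 1) variable
print(0.29 ** 2)        # the COVARIANCE_FACTOR used above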

max_drawdown(era_corrs) staticmethod

Maximum drawdown per era.

Source code in numerblox/evaluation.py
455
456
457
458
459
460
461
462
463
464
@staticmethod
def max_drawdown(era_corrs: pd.Series) -> np.float64:
    """Maximum drawdown per era."""
    # Arbitrarily large window
    rolling_max = (
        (era_corrs + 1).cumprod().rolling(window=9000, min_periods=1).max()
    )
    daily_value = (era_corrs + 1).cumprod()
    max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
    return max_drawdown
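
Because the era scores are compounded before the drawdown is taken, the result is expressed as a negative fraction of the running peak. A standalone mirror on toy values:

import pandas as pd

era_corrs = pd.Series([0.03, -0.02, -0.04, 0.01, 0.05])  # toy values

cumulative = (era_corrs + 1).cumprod()
rolling_max = cumulative.rolling(window=9000, min_periods=1).max()
max_drawdown = -((rolling_max - cumulative) / rolling_max).max()
print(max_drawdown)  # roughly -0.059 for the toy series above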

max_feature_exposure(dataf, feature_cols, pred_col)

Maximum exposure over all features.

Source code in numerblox/evaluation.py
500
501
502
503
504
505
506
507
508
def max_feature_exposure(
    self, dataf: pd.DataFrame, feature_cols: List[str], pred_col: str
) -> np.float64:
    """Maximum exposure over all features."""
    max_per_era = dataf.groupby(self.era_col).apply(
        lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max()
    )
    max_feature_exposure = max_per_era.mean(skipna=True)
    return max_feature_exposure

mean_std_sharpe(era_corrs)

Average, standard deviation and Sharpe ratio for correlations per era.

Source code in numerblox/evaluation.py
419
420
421
422
423
424
425
426
427
428
429
def mean_std_sharpe(
    self, era_corrs: pd.Series
) -> Tuple[np.float64, np.float64, np.float64]:
    """
    Average, standard deviation and Sharpe ratio for
    correlations per era.
    """
    mean = pd.Series(era_corrs.mean()).item()
    std = pd.Series(era_corrs.std(ddof=0)).item()
    sharpe = np.nan if std == 0 else mean / std
    return mean, std, sharpe

numerai_corr(dataf, pred_col, target_col)

Computes 'Numerai Corr' aka 'Corrv2'. More info: https://forum.numer.ai/t/target-cyrus-new-primary-target/6303

Assumes original target col as input (i.e. in [0, 1] range).

Source code in numerblox/evaluation.py
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
def numerai_corr(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str
) -> np.float64:
    """
    Computes 'Numerai Corr' aka 'Corrv2'.
    More info: https://forum.numer.ai/t/target-cyrus-new-primary-target/6303

    Assumes original target col as input (i.e. in [0, 1] range).
    """
    # Rank and gaussianize predictions
    ranked_preds = self._normalize_uniform(
        dataf[pred_col].fillna(0.5), method="average"
    )
    gauss_ranked_preds = stats.norm.ppf(ranked_preds)
    # Center target from [0...1] to [-0.5...0.5] range
    targets = dataf[target_col]
    centered_target = targets - targets.mean()
    # Accentuate tails of predictions and targets
    preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5
    target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5
    # Pearson correlation
    corr, _ = stats.pearsonr(preds_p15, target_p15)
    return corr
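
A standalone sketch of the same transformation on toy data (rank and gaussianize the predictions, center the [0, 1] target, accentuate both tails with a power of 1.5, then take a Pearson correlation):

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(7)
preds = pd.Series(rng.uniform(size=200))                                # toy predictions
targets = pd.Series(rng.choice([0.0, 0.25, 0.5, 0.75, 1.0], size=200))  # toy [0, 1] target

ranked_preds = (preds.rank(method="average") - 0.5) / len(preds)  # tie-kept uniform ranks
gauss_ranked_preds = stats.norm.ppf(ranked_preds)
centered_target = targets - targets.mean()
preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5
target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5
corr, _ = stats.pearsonr(preds_p15, target_p15)
print(corr)  # 'Numerai Corr' / Corrv2 for the toy data (close to 0 for random inputs)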

per_era_corrs(dataf, pred_col, target_col)

Correlation between prediction and target for each era.

Source code in numerblox/evaluation.py
401
402
403
404
405
406
407
408
409
def per_era_corrs(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str
) -> pd.Series:
    """Correlation between prediction and target for each era."""
    return dataf.groupby(self.era_col).apply(
        lambda d: self._normalize_uniform(d[pred_col].fillna(0.5)).corr(
            d[target_col]
        )
    )

per_era_numerai_corrs(dataf, pred_col, target_col)

Numerai Corr between prediction and target for each era.

Source code in numerblox/evaluation.py
411
412
413
414
415
416
417
def per_era_numerai_corrs(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str
) -> pd.Series:
    """Numerai Corr between prediction and target for each era."""
    return dataf.groupby(self.era_col).apply(
        lambda d: self.numerai_corr(d.fillna(0.5), pred_col, target_col)
    )

plot_correlations(dataf, pred_cols, corr_cols=None, target_col='target', roll_mean=20)

Plot per era correlations over time. :param dataf: DataFrame that contains at least all pred_cols, target_col and corr_cols. :param pred_cols: List of prediction columns to calculate per era correlations for and plot. :param corr_cols: Per era correlations already prepared to include in the plot. This is optional for if you already have per era correlations prepared in your input dataf. :param target_col: Target column name to compute per era correlations against. :param roll_mean: How many eras should be averaged to compute a rolling score.

Source code in numerblox/evaluation.py
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
def plot_correlations(
    self,
    dataf: pd.DataFrame,
    pred_cols: List[str],
    corr_cols: list = None,
    target_col: str = "target",
    roll_mean: int = 20,
):
    """
    Plot per era correlations over time.
    :param dataf: DataFrame that contains at least all pred_cols, target_col and corr_cols.
    :param pred_cols: List of prediction columns to calculate per era correlations for and plot.
    :param corr_cols: Per era correlations already prepared to include in the plot.
    This is optional for if you already have per era correlations prepared in your input dataf.
    :param target_col: Target column name to compute per era correlations against.
    :param roll_mean: How many eras should be averaged to compute a rolling score.
    """
    validation_by_eras = pd.DataFrame()
    # Compute per era correlation for each prediction column.
    for pred_col in pred_cols:
        per_era_corrs = self.per_era_numerai_corrs(
            dataf, pred_col=pred_col, target_col=target_col
        )
        validation_by_eras.loc[:, pred_col] = per_era_corrs

    # Add prepared per era correlation if any.
    if corr_cols is not None:
        for corr_col in corr_cols:
            validation_by_eras.loc[:, corr_col] = dataf[corr_col]

    validation_by_eras.rolling(roll_mean).mean().plot(
        kind="line",
        marker="o",
        ms=4,
        title=f"Rolling Per Era Correlation Mean (rolling window size: {roll_mean})",
        figsize=(15, 5),
    )
    plt.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.05),
        fancybox=True,
        shadow=True,
        ncol=1,
    )
    plt.axhline(y=0.0, color="r", linestyle="--")
    plt.show()

    validation_by_eras.cumsum().plot(
        title="Cumulative Sum of Era Correlations", figsize=(15, 5)
    )
    plt.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.05),
        fancybox=True,
        shadow=True,
        ncol=1,
    )
    plt.axhline(y=0.0, color="r", linestyle="--")
    plt.show()
    return

smart_sharpe(era_corrs)

Sharpe adjusted for autocorrelation. :param era_corrs: Correlation scores by era

Source code in numerblox/evaluation.py
749
750
751
752
753
754
755
756
def smart_sharpe(self, era_corrs: pd.Series) -> np.float64:
    """
    Sharpe adjusted for autocorrelation.
    :param era_corrs: Correlation scores by era
    """
    return np.nanmean(era_corrs) / (
        np.nanstd(era_corrs, ddof=1) * self.autocorr_penalty(era_corrs)
    )
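
A standalone sketch comparing a plain Sharpe with the autocorrelation-adjusted version on a toy, positively autocorrelated series (values made up):

import numpy as np
import pandas as pd

era_corrs = pd.Series([0.020, 0.025, 0.030, 0.028, 0.022, 0.018])  # toy values

n = len(era_corrs)
p = np.corrcoef(era_corrs[:-1], era_corrs[1:])[0, 1]  # 1st order autocorrelation
penalty = np.sqrt(1 + 2 * np.sum([((n - i) / n) * p ** i for i in range(1, n)]))

plain_sharpe = np.nanmean(era_corrs) / np.nanstd(era_corrs, ddof=1)
smart_sharpe = plain_sharpe / penalty  # equivalent to the method above
print(plain_sharpe, smart_sharpe)      # smart_sharpe is lower whenever the penalty exceeds 1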

tbx_mean_std_sharpe(dataf, pred_col, target_col, tb=200)

Calculate Mean, Standard deviation and Sharpe ratio when we focus on the x top and x bottom predictions. :param tb: How many of top and bottom predictions to focus on. TB200 and TB500 are the most common situations.

Source code in numerblox/evaluation.py
536
537
538
539
540
541
542
543
544
545
546
547
548
def tbx_mean_std_sharpe(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str, tb: int = 200
) -> Tuple[np.float64, np.float64, np.float64]:
    """
    Calculate Mean, Standard deviation and Sharpe ratio
    when we focus on the x top and x bottom predictions.
    :param tb: How many of top and bottom predictions to focus on.
    TB200 and TB500 are the most common situations.
    """
    tb_val_corrs = self._score_by_date(
        dataf=dataf, columns=[pred_col], target=target_col, tb=tb
    )
    return self.mean_std_sharpe(era_corrs=tb_val_corrs)
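
Under the hood this relies on the TB selection in _score_by_date: per era, only the tb lowest- and tb highest-ranked predictions are kept before correlating with the target. A standalone sketch of that selection on toy data:

import numpy as np

rng = np.random.default_rng(3)
era_pred = rng.uniform(size=(1, 1000))   # one prediction column for a single toy era
era_target = rng.uniform(size=1000)      # toy target values
tb = 200

tbidx = np.argsort(era_pred, axis=1)
tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)  # bottom 200 + top 200
ccs = [
    np.corrcoef(era_target[idx], pred[idx])[0, 1]
    for idx, pred in zip(tbidx, era_pred)
]
print(ccs)  # per-column correlation computed on the 400 selected rows only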

NumeraiClassicEvaluator

Bases: BaseEvaluator

Evaluator for all metrics that are relevant in Numerai Classic.

Source code in numerblox/evaluation.py
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
class NumeraiClassicEvaluator(BaseEvaluator):
    """
    Evaluator for all metrics that are relevant in Numerai Classic.
    """

    def __init__(
        self,
        era_col: str = "era",
        metrics_list: List[str] = FAST_METRICS,
        custom_functions: Dict[str, Dict[str, Any]] = None,
        show_detailed_progress_bar: bool = True,
    ):
        for metric in metrics_list:
            assert (
                metric in ALL_CLASSIC_METRICS
            ), f"Metric '{metric}' not found. Valid metrics: {ALL_CLASSIC_METRICS}."
        super().__init__(
            era_col=era_col, metrics_list=metrics_list, custom_functions=custom_functions,
            show_detailed_progress_bar=show_detailed_progress_bar
        )
        self.fncv3_features = FNCV3_FEATURES

    def full_evaluation(
        self,
        dataf: pd.DataFrame,
        pred_cols: List[str],
        target_col: str = "target",
        benchmark_cols: list = None,
    ) -> pd.DataFrame:
        val_stats = pd.DataFrame()
        dataf = dataf.fillna(0.5)
        feature_cols = [col for col in dataf.columns if col.startswith("feature")]

        # Check if sufficient columns are present in dataf to compute FNC
        feature_set = set(dataf.columns)
        if set(self.fncv3_features).issubset(feature_set):
            print(
                "Using 'v4.2/features.json/fncv3_features' feature set to calculate FNC metrics."
            )
            valid_features = self.fncv3_features
        else:
            print(
                "WARNING: No suitable feature set defined for FNC. Skipping calculation of FNC."
            )
            valid_features = []

        with tqdm(pred_cols, desc="Evaluation") as pbar:
            for col in pbar:
                # Metrics that can be calculated for both Numerai Classic and Signals
                col_stats = self.evaluation_one_col(
                    dataf=dataf,
                    feature_cols=feature_cols,
                    pred_col=col,
                    target_col=target_col,
                    benchmark_cols=benchmark_cols,
                )
                # Numerai Classic specific metrics
                if valid_features and "fncv3_mean_std_sharpe" in self.metrics_list:
                    pbar.set_description_str(f"fncv3_mean_std_sharpe for evaluation of '{col}'")
                    # Using only valid features defined in FNCV3_FEATURES
                    fnc_v3, fn_std_v3, fn_sharpe_v3 = self.feature_neutral_mean_std_sharpe(
                        dataf=dataf,
                        pred_col=col,
                        target_col=target_col,
                        feature_names=valid_features,
                    )
                    col_stats.loc[col, "feature_neutral_mean_v3"] = fnc_v3
                    col_stats.loc[col, "feature_neutral_std_v3"] = fn_std_v3
                    col_stats.loc[col, "feature_neutral_sharpe_v3"] = fn_sharpe_v3

                val_stats = pd.concat([val_stats, col_stats], axis=0)
        return val_stats

NumeraiSignalsEvaluator

Bases: BaseEvaluator

Evaluator for all metrics that are relevant in Numerai Signals.

Source code in numerblox/evaluation.py
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
class NumeraiSignalsEvaluator(BaseEvaluator):
    """Evaluator for all metrics that are relevant in Numerai Signals."""
    # Columns retrievable from Numerai Signals diagnostics.
    # More info: https://forum.numer.ai/t/signals-diagnostics-guide/5950
    VALID_DIAGNOSTICS_COLS = ["validationCorrV4", "validationFncV4", "validationIcV2", "validationRic"]

    def __init__(
        self,
        era_col: str = "date",
        metrics_list: List[str] = FAST_METRICS,
        custom_functions: Dict[str, Dict[str, Any]] = None,
        show_detailed_progress_bar: bool = True,
    ):
        for metric in metrics_list:
            assert (
                metric in ALL_SIGNALS_METRICS
            ), f"Metric '{metric}' not found. Valid metrics: {ALL_SIGNALS_METRICS}."
        super().__init__(
            era_col=era_col, metrics_list=metrics_list, custom_functions=custom_functions,
            show_detailed_progress_bar=show_detailed_progress_bar
        )

    def get_diagnostics(
        self, val_dataf: pd.DataFrame, model_name: str, key: Key, timeout_min: int = 2,
        col: Union[str, None] = "validationFncV4"
    ) -> pd.DataFrame:
        """
        Retrieved neutralized validation correlation by era. \n
        Calculated on Numerai servers. \n
        :param val_dataf: A DataFrame containing prediction, date, ticker and data_type columns. \n
        data_type column should contain 'validation' instances. \n
        :param model_name: Any model name for which you have authentication credentials. \n
        :param key: Key object to authenticate upload of diagnostics. \n
        :param timeout_min: How many minutes to wait on diagnostics Computing on Numerai servers before timing out. \n
        :param col: Which column to return. Should be one of ['validationCorrV4', 'validationFncV4', 'validationIcV2', 'validationRic']. If None, all columns will be returned. \n
        2 minutes by default. \n
        :return: Pandas Series with era as index and neutralized validation correlations (validationCorr).
        """
        assert col in self.VALID_DIAGNOSTICS_COLS or col is None, f"corr_col should be one of {self.VALID_DIAGNOSTICS_COLS} or None. Got: '{col}'"
        api = SignalsAPI(public_id=key.pub_id, secret_key=key.secret_key)
        model_id = api.get_models()[model_name]
        diagnostics_id = api.upload_diagnostics(df=val_dataf, model_id=model_id)
        data = self.__await_diagnostics(
            api=api,
            model_id=model_id,
            diagnostics_id=diagnostics_id,
            timeout_min=timeout_min,
        )
        diagnostics_df = pd.DataFrame(data["perEraDiagnostics"]).set_index("era")
        diagnostics_df.index = pd.to_datetime(diagnostics_df.index)
        return_cols = [col] if col is not None else self.VALID_DIAGNOSTICS_COLS
        return diagnostics_df[return_cols]

    @staticmethod
    def __await_diagnostics(
        api: SignalsAPI,
        model_id: str,
        diagnostics_id: str,
        timeout_min: int,
        interval_sec: int = 15,
    ):
        """
        Wait for diagnostics to be uploaded.
        Try every 'interval_sec' seconds until 'timeout_min' minutes have passed.
        """
        timeout = time.time() + 60 * timeout_min
        data = {"status": "not_done"}
        while time.time() < timeout:
            data = api.diagnostics(model_id=model_id, diagnostics_id=diagnostics_id)[0]
            if not data["status"] == "done":
                print(
                    f"Diagnostics not processed yet. Sleeping for another {interval_sec} seconds."
                )
                time.sleep(interval_sec)
            else:
                break
        if not data["status"] == "done":
            raise Exception(
                f"Diagnostics couldn't be retrieved within {timeout_min} minutes after uploading. Check if Numerai API is offline."
            )
        return data

__await_diagnostics(api, model_id, diagnostics_id, timeout_min, interval_sec=15) staticmethod

Wait for diagnostics to be uploaded. Try every 'interval_sec' seconds until 'timeout_min' minutes have passed.

Source code in numerblox/evaluation.py
@staticmethod
def __await_diagnostics(
    api: SignalsAPI,
    model_id: str,
    diagnostics_id: str,
    timeout_min: int,
    interval_sec: int = 15,
):
    """
    Wait for diagnostics to be uploaded.
    Try every 'interval_sec' seconds until 'timeout_min' minutes have passed.
    """
    timeout = time.time() + 60 * timeout_min
    data = {"status": "not_done"}
    while time.time() < timeout:
        data = api.diagnostics(model_id=model_id, diagnostics_id=diagnostics_id)[0]
        if not data["status"] == "done":
            print(
                f"Diagnostics not processed yet. Sleeping for another {interval_sec} seconds."
            )
            time.sleep(interval_sec)
        else:
            break
    if not data["status"] == "done":
        raise Exception(
            f"Diagnostics couldn't be retrieved within {timeout_min} minutes after uploading. Check if Numerai API is offline."
        )
    return data

get_diagnostics(val_dataf, model_name, key, timeout_min=2, col='validationFncV4')

Retrieves the neutralized validation correlations by era.

Calculated on Numerai servers.

:param val_dataf: A DataFrame containing prediction, date, ticker and data_type columns.

data_type column should contain 'validation' instances.

:param model_name: Any model name for which you have authentication credentials.

:param key: Key object to authenticate upload of diagnostics.

:param timeout_min: How many minutes to wait for diagnostics to be computed on Numerai servers before timing out. 2 minutes by default.

:param col: Which column to return. Should be one of ['validationCorrV4', 'validationFncV4', 'validationIcV2', 'validationRic']. If None, all columns will be returned.

:return: Pandas DataFrame with era as index and the neutralized validation diagnostics for the requested column(s).

Source code in numerblox/evaluation.py
def get_diagnostics(
    self, val_dataf: pd.DataFrame, model_name: str, key: Key, timeout_min: int = 2,
    col: Union[str, None] = "validationFncV4"
) -> pd.DataFrame:
    """
    Retrieved neutralized validation correlation by era. \n
    Calculated on Numerai servers. \n
    :param val_dataf: A DataFrame containing prediction, date, ticker and data_type columns. \n
    data_type column should contain 'validation' instances. \n
    :param model_name: Any model name for which you have authentication credentials. \n
    :param key: Key object to authenticate upload of diagnostics. \n
    :param timeout_min: How many minutes to wait on diagnostics Computing on Numerai servers before timing out. \n
    :param col: Which column to return. Should be one of ['validationCorrV4', 'validationFncV4', 'validationIcV2', 'validationRic']. If None, all columns will be returned. \n
    2 minutes by default. \n
    :return: Pandas Series with era as index and neutralized validation correlations (validationCorr).
    """
    assert col in self.VALID_DIAGNOSTICS_COLS or col is None, f"corr_col should be one of {self.VALID_DIAGNOSTICS_COLS} or None. Got: '{col}'"
    api = SignalsAPI(public_id=key.pub_id, secret_key=key.secret_key)
    model_id = api.get_models()[model_name]
    diagnostics_id = api.upload_diagnostics(df=val_dataf, model_id=model_id)
    data = self.__await_diagnostics(
        api=api,
        model_id=model_id,
        diagnostics_id=diagnostics_id,
        timeout_min=timeout_min,
    )
    diagnostics_df = pd.DataFrame(data["perEraDiagnostics"]).set_index("era")
    diagnostics_df.index = pd.to_datetime(diagnostics_df.index)
    return_cols = [col] if col is not None else self.VALID_DIAGNOSTICS_COLS
    return diagnostics_df[return_cols]
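
As a usage sketch (not part of the library source), get_diagnostics can be called roughly as follows. The import path for Key, the credential values and the model name are assumptions; adjust them to your setup.

import pandas as pd
from numerblox.evaluation import NumeraiSignalsEvaluator
from numerblox.misc import Key  # assumed import path for Key; adjust if it differs in your version

# Hypothetical Signals validation DataFrame with prediction, date, ticker and data_type columns.
val_df = pd.read_csv("signals_validation_predictions.csv")  # placeholder path

key = Key(pub_id="YOUR_PUBLIC_ID", secret_key="YOUR_SECRET_KEY")  # placeholder credentials
evaluator = NumeraiSignalsEvaluator(era_col="date")
fnc_per_era = evaluator.get_diagnostics(
    val_dataf=val_df,
    model_name="your_signals_model",  # a model you have credentials for
    key=key,
    timeout_min=5,
    col="validationFncV4",
)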

BaseSubmitter

Bases: BaseIO

Basic functionality for submitting to Numerai. Uses numerapi under the hood. More info: https://numerapi.readthedocs.io/

:param directory_path: Directory to store and read submissions from.

:param api: NumerAPI or SignalsAPI

:param max_retries: Maximum number of retries for uploading predictions to Numerai.

:param sleep_time: Time to sleep between uploading retries.

:param fail_silently: Whether to skip uploading to Numerai without raising an error. Useful if you are uploading many models in a loop and want to skip models that fail to upload.

Source code in numerblox/submission.py
class BaseSubmitter(BaseIO):
    """
    Basic functionality for submitting to Numerai. 
    Uses numerapi under the hood.
    More info: https://numerapi.readthedocs.io/ 

    :param directory_path: Directory to store and read submissions from. 
    :param api: NumerAPI or SignalsAPI
    :param max_retries: Maximum number of retries for uploading predictions to Numerai. 
    :param sleep_time: Time to sleep between uploading retries.
    :param fail_silently: Whether to skip uploading to Numerai without raising an error. 
    Useful for if you are uploading many models in a loop and want to skip models that fail to upload.
    """
    def __init__(self, directory_path: str, api: Union[NumerAPI, SignalsAPI], max_retries: int, 
                 sleep_time: int, fail_silently: bool):
        super().__init__(directory_path)
        self.api = api
        self.max_retries = max_retries
        self.sleep_time = sleep_time
        self.fail_silently = fail_silently

    @abstractmethod
    def save_csv(
        self,
        dataf: pd.DataFrame,
        file_name: str,
        cols: Union[str, list],
        *args,
        **kwargs,
    ):
        """
        For Numerai Classic: Save index column + 'cols' (targets) to CSV.
        For Numerai Signals: Save ticker, date, data_type and signal columns to CSV.
        """
        ...

    def upload_predictions(self, file_name: str, model_name: str, *args, **kwargs):
        """
        Upload CSV file to Numerai for given model name.
        :param file_name: File name/path relative to directory_path.
        :param model_name: Lowercase raw model name (For example, 'integration_test').
        """
        full_path = str(self.dir / file_name)
        model_id = self._get_model_id(model_name=model_name)
        api_type = str(self.api.__class__.__name__)
        print(
            f"{api_type}: Uploading predictions from '{full_path}' for model '{model_name}' (model_id='{model_id}')"
        )
        for attempt in range(self.max_retries):
            try:
                self.api.upload_predictions(
                    file_path=full_path, model_id=model_id, *args, **kwargs
                )
                print(
                    f"{api_type} submission of '{full_path}' for '{model_name}' is successful!"
                )
                return
            except Exception as e:
                if attempt < self.max_retries - 1:  # i.e. not the last attempt
                    print(f"Failed to upload '{full_path}' for '{model_name}' to Numerai. Retrying in {self.sleep_time} seconds...")
                    print(f"Error: {e}")
                    time.sleep(self.sleep_time)
                else:
                    if self.fail_silently:
                        print(f"Failed to upload'{full_path}' for '{model_name}' to Numerai. Skipping...")
                        print(f"Error: {e}")
                    else:
                        print(f"Failed to upload '{full_path}' for '{model_name}' to Numerai after {self.max_retries} attempts.")
                        raise e

    def full_submission(
        self,
        dataf: pd.DataFrame,
        model_name: str,
        cols: Union[str, list],
        file_name: str = 'submission.csv',
        *args,
        **kwargs,
    ):
        """
        Save DataFrame to csv and upload predictions through API.

        :param dataf: Main DataFrame containing `cols`.
        :param model_name: Lowercase Numerai model name.
        :param file_name: path to save model to relative to base directory.
        :param cols: Columns to be saved in submission file.
        1 prediction column for Numerai Classic.
        At least 1 prediction column and 1 ticker column for Numerai Signals.
        *args, **kwargs are passed to numerapi API.
        For example `version` argument in Numerai Classic submissions.
        """
        self.save_csv(dataf=dataf, file_name=file_name, cols=cols)
        self.upload_predictions(
            file_name=file_name, model_name=model_name,
            *args, **kwargs
        )

    def combine_csvs(self, csv_paths: list,
                     aux_cols: list,
                     era_col: str = None,
                     pred_col: str = 'prediction') -> pd.DataFrame:
        """
        Read in csv files and combine all predictions with a rank mean. \n
        Multi-target predictions will be averaged out. \n
        :param csv_paths: List of full paths to .csv prediction files. \n
        :param aux_cols: ['id'] for Numerai Classic. \n
        ['ticker', 'last_friday', 'data_type'], for example, with Numerai Signals. \n
        :param era_col: Column indicating era ('era' or 'last_friday'). \n
        Will be used for Grouping the rank mean if given. Skip groupby if no era_col provided. \n
        :param pred_col: 'prediction' for Numerai Classic and 'signal' for Numerai Signals.
        """
        all_datafs = [pd.read_csv(path, index_col=aux_cols) for path in tqdm(csv_paths)]
        final_dataf = pd.concat(all_datafs, axis="columns")
        # Remove issue of duplicate columns
        numeric_cols = final_dataf.select_dtypes(include=np.number).columns
        final_dataf.rename({k: str(v) for k, v in zip(numeric_cols, range(len(numeric_cols)))},
                           axis=1,
                           inplace=True)
        # Combine all numeric columns with rank mean
        num_dataf = final_dataf.select_dtypes(include=np.number)
        num_dataf = num_dataf.groupby(era_col) if era_col else num_dataf
        final_dataf[pred_col] = num_dataf.rank(pct=True, method="first").mean(axis=1)
        return final_dataf[[pred_col]]

    def _get_model_id(self, model_name: str) -> str:
        """
        Get ID needed for prediction uploading.
        :param model_name: Raw lowercase model name
        of Numerai model that you have access to.
        """
        return self.get_model_mapping[model_name]

    @property
    def get_model_mapping(self) -> dict:
        """Mapping between raw model names and model IDs."""
        return self.api.get_models()

    def _check_value_range(self, dataf: pd.DataFrame, cols: Union[str, list]):
        """ Check if all predictions are in range (0...1). """
        cols = [cols] if isinstance(cols, str) else cols
        for col in cols:
            if not dataf[col].between(0, 1).all():
                min_val, max_val = dataf[col].min(), dataf[col].max()
                raise ValueError(
                    f"Values must be between 0 and 1. \
Found min value of '{min_val}' and max value of '{max_val}' for column '{col}'."
                )

    def __call__(
            self,
            dataf: pd.DataFrame,
            model_name: str,
            file_name: str = "submission.csv",
            cols: Union[str, list] = "prediction",
            *args,
            **kwargs,
    ):
        """
        The most common use case will be to create a CSV and submit it immediately after that.
        full_submission handles this.
        """
        self.full_submission(
            dataf=dataf,
            file_name=file_name,
            model_name=model_name,
            cols=cols,
            *args,
            **kwargs,
        )

get_model_mapping: dict property

Mapping between raw model names and model IDs.

__call__(dataf, model_name, file_name='submission.csv', cols='prediction', *args, **kwargs)

The most common use case will be to create a CSV and submit it immediately after that. full_submission handles this.

Source code in numerblox/submission.py
def __call__(
        self,
        dataf: pd.DataFrame,
        model_name: str,
        file_name: str = "submission.csv",
        cols: Union[str, list] = "prediction",
        *args,
        **kwargs,
):
    """
    The most common use case will be to create a CSV and submit it immediately after that.
    full_submission handles this.
    """
    self.full_submission(
        dataf=dataf,
        file_name=file_name,
        model_name=model_name,
        cols=cols,
        *args,
        **kwargs,
    )

combine_csvs(csv_paths, aux_cols, era_col=None, pred_col='prediction')

Read in csv files and combine all predictions with a rank mean.

Multi-target predictions will be averaged out.

:param csv_paths: List of full paths to .csv prediction files.

:param aux_cols: ['id'] for Numerai Classic.

['ticker', 'last_friday', 'data_type'], for example, with Numerai Signals.

:param era_col: Column indicating era ('era' or 'last_friday').

Will be used for grouping the rank mean if given. The groupby step is skipped if no era_col is provided.

:param pred_col: 'prediction' for Numerai Classic and 'signal' for Numerai Signals.

Source code in numerblox/submission.py
def combine_csvs(self, csv_paths: list,
                 aux_cols: list,
                 era_col: str = None,
                 pred_col: str = 'prediction') -> pd.DataFrame:
    """
    Read in csv files and combine all predictions with a rank mean. \n
    Multi-target predictions will be averaged out. \n
    :param csv_paths: List of full paths to .csv prediction files. \n
    :param aux_cols: ['id'] for Numerai Classic. \n
    ['ticker', 'last_friday', 'data_type'], for example, with Numerai Signals. \n
    :param era_col: Column indicating era ('era' or 'last_friday'). \n
    Will be used for Grouping the rank mean if given. Skip groupby if no era_col provided. \n
    :param pred_col: 'prediction' for Numerai Classic and 'signal' for Numerai Signals.
    """
    all_datafs = [pd.read_csv(path, index_col=aux_cols) for path in tqdm(csv_paths)]
    final_dataf = pd.concat(all_datafs, axis="columns")
    # Remove issue of duplicate columns
    numeric_cols = final_dataf.select_dtypes(include=np.number).columns
    final_dataf.rename({k: str(v) for k, v in zip(numeric_cols, range(len(numeric_cols)))},
                       axis=1,
                       inplace=True)
    # Combine all numeric columns with rank mean
    num_dataf = final_dataf.select_dtypes(include=np.number)
    num_dataf = num_dataf.groupby(era_col) if era_col else num_dataf
    final_dataf[pred_col] = num_dataf.rank(pct=True, method="first").mean(axis=1)
    return final_dataf[[pred_col]]
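
A minimal sketch of combining several Numerai Classic prediction files with combine_csvs, assuming each CSV is indexed by 'id' and holds one numeric prediction column; the paths, the Key import path and the credentials are placeholders.

from numerblox.misc import Key  # assumed import path for Key
from numerblox.submission import NumeraiClassicSubmitter

key = Key(pub_id="YOUR_PUBLIC_ID", secret_key="YOUR_SECRET_KEY")  # placeholder credentials
submitter = NumeraiClassicSubmitter(directory_path="sub_dir", key=key)

# Each CSV is expected to contain an 'id' column plus one numeric prediction column.
combined = submitter.combine_csvs(
    csv_paths=["preds_model_a.csv", "preds_model_b.csv"],  # placeholder paths
    aux_cols=["id"],       # index column(s) for Numerai Classic
    era_col=None,          # no era grouping -> plain rank mean across files
    pred_col="prediction",
)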

full_submission(dataf, model_name, cols, file_name='submission.csv', *args, **kwargs)

Save DataFrame to csv and upload predictions through API.

:param dataf: Main DataFrame containing cols.

:param model_name: Lowercase Numerai model name.

:param file_name: Path to save the submission file to, relative to the base directory.

:param cols: Columns to be saved in the submission file. 1 prediction column for Numerai Classic. At least 1 prediction column and 1 ticker column for Numerai Signals.

*args, **kwargs are passed to the numerapi API, for example the version argument in Numerai Classic submissions.

Source code in numerblox/submission.py
def full_submission(
    self,
    dataf: pd.DataFrame,
    model_name: str,
    cols: Union[str, list],
    file_name: str = 'submission.csv',
    *args,
    **kwargs,
):
    """
    Save DataFrame to csv and upload predictions through API.

    :param dataf: Main DataFrame containing `cols`.
    :param model_name: Lowercase Numerai model name.
    :param file_name: path to save model to relative to base directory.
    :param cols: Columns to be saved in submission file.
    1 prediction column for Numerai Classic.
    At least 1 prediction column and 1 ticker column for Numerai Signals.
    *args, **kwargs are passed to numerapi API.
    For example `version` argument in Numerai Classic submissions.
    """
    self.save_csv(dataf=dataf, file_name=file_name, cols=cols)
    self.upload_predictions(
        file_name=file_name, model_name=model_name,
        *args, **kwargs
    )

save_csv(dataf, file_name, cols, *args, **kwargs) abstractmethod

For Numerai Classic: Save index column + 'cols' (targets) to CSV. For Numerai Signals: Save ticker, date, data_type and signal columns to CSV.

Source code in numerblox/submission.py
@abstractmethod
def save_csv(
    self,
    dataf: pd.DataFrame,
    file_name: str,
    cols: Union[str, list],
    *args,
    **kwargs,
):
    """
    For Numerai Classic: Save index column + 'cols' (targets) to CSV.
    For Numerai Signals: Save ticker, date, data_type and signal columns to CSV.
    """
    ...

upload_predictions(file_name, model_name, *args, **kwargs)

Upload CSV file to Numerai for given model name.

:param file_name: File name/path relative to directory_path.

:param model_name: Lowercase raw model name (For example, 'integration_test').

Source code in numerblox/submission.py
def upload_predictions(self, file_name: str, model_name: str, *args, **kwargs):
    """
    Upload CSV file to Numerai for given model name.
    :param file_name: File name/path relative to directory_path.
    :param model_name: Lowercase raw model name (For example, 'integration_test').
    """
    full_path = str(self.dir / file_name)
    model_id = self._get_model_id(model_name=model_name)
    api_type = str(self.api.__class__.__name__)
    print(
        f"{api_type}: Uploading predictions from '{full_path}' for model '{model_name}' (model_id='{model_id}')"
    )
    for attempt in range(self.max_retries):
        try:
            self.api.upload_predictions(
                file_path=full_path, model_id=model_id, *args, **kwargs
            )
            print(
                f"{api_type} submission of '{full_path}' for '{model_name}' is successful!"
            )
            return
        except Exception as e:
            if attempt < self.max_retries - 1:  # i.e. not the last attempt
                print(f"Failed to upload '{full_path}' for '{model_name}' to Numerai. Retrying in {self.sleep_time} seconds...")
                print(f"Error: {e}")
                time.sleep(self.sleep_time)
            else:
                if self.fail_silently:
                    print(f"Failed to upload'{full_path}' for '{model_name}' to Numerai. Skipping...")
                    print(f"Error: {e}")
                else:
                    print(f"Failed to upload '{full_path}' for '{model_name}' to Numerai after {self.max_retries} attempts.")
                    raise e

NumerBaySubmitter

Bases: BaseSubmitter

Submit to NumerBay to fulfill sale orders, in addition to submission to Numerai.

:param tournament_submitter: Base tournament submitter (NumeraiClassicSubmitter or NumeraiSignalsSubmitter). This submitter will use the same directory path.

:param upload_to_numerai: Whether to also submit to Numerai using the tournament submitter. Defaults to True; set to False to only upload to NumerBay.

:param numerbay_username: NumerBay username

:param numerbay_password: NumerBay password

Source code in numerblox/submission.py
class NumerBaySubmitter(BaseSubmitter):
    """
    Submit to NumerBay to fulfill sale orders, in addition to submission to Numerai.

    :param tournament_submitter: Base tournament submitter (NumeraiClassicSubmitter or NumeraiSignalsSubmitter). This submitter will use the same directory path.
    :param upload_to_numerai: Whether to also submit to Numerai using the tournament submitter. Defaults to True, set to False to only upload to NumerBay.
    :param numerbay_username: NumerBay username
    :param numerbay_password: NumerBay password
    """
    def __init__(self,
                 tournament_submitter: Union[NumeraiClassicSubmitter, NumeraiSignalsSubmitter],
                 upload_to_numerai: bool = True,
                 numerbay_username: str = None,
                 numerbay_password: str = None):
        super().__init__(
            directory_path=str(tournament_submitter.dir), api=tournament_submitter.api,
            max_retries=tournament_submitter.max_retries, sleep_time=tournament_submitter.sleep_time,
            fail_silently=tournament_submitter.fail_silently
        )
        from numerbay import NumerBay
        self.numerbay_api = NumerBay(username=numerbay_username, password=numerbay_password)
        self.tournament_submitter = tournament_submitter
        self.upload_to_numerai = upload_to_numerai

    def upload_predictions(self,
                           file_name: str,
                           model_name: str,
                           numerbay_product_full_name: str,
                           *args,
                           **kwargs):
        """
        Upload CSV file to NumerBay (and Numerai if 'upload_to_numerai' is True) for given model name and NumerBay product full name.
        :param file_name: File name/path relative to directory_path.
        :param model_name: Lowercase raw model name (For example, 'integration_test').
        :param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'
        """
        if self.upload_to_numerai:
            self.tournament_submitter.upload_predictions(file_name, model_name, *args, **kwargs)

        full_path = str(self.dir / file_name)
        api_type = str(self.numerbay_api.__class__.__name__)
        print(
            f"{api_type}: Uploading predictions from '{full_path}' for NumerBay product '{numerbay_product_full_name}'"
        )
        artifact = self.numerbay_api.upload_artifact(
            str(full_path), product_full_name=numerbay_product_full_name
        )
        if artifact:
            print(
                f"{api_type} submission of '{full_path}' for NumerBay product [bold blue]{numerbay_product_full_name} is successful!"
            )
        else:
            print(f"""WARNING: Upload skipped for NumerBay product '{numerbay_product_full_name}', 
                  the product uses buyer-side encryption but does not have any active sale order to upload for.""")

    def full_submission(
        self,
        dataf: pd.DataFrame,
        model_name: str,
        cols: Union[str, list],
        numerbay_product_full_name: str,
        file_name: str = 'submission.csv',
        *args,
        **kwargs,
    ):
        """
        Save DataFrame to csv and upload predictions through API.

        :param dataf: Main DataFrame containing `cols`.
        :param model_name: Lowercase Numerai model name.
        :param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'
        :param file_name: path to save model to relative to base directory.
        :param cols: Columns to be saved in submission file.
        1 prediction column for Numerai Classic.
        At least 1 prediction column and 1 ticker column for Numerai Signals.
        *args, **kwargs are passed to numerapi API.
        For example `version` argument in Numerai Classic submissions.
        """
        self.save_csv(dataf=dataf, file_name=file_name, cols=cols)
        self.upload_predictions(
            file_name=file_name, model_name=model_name, numerbay_product_full_name=numerbay_product_full_name,
            *args, **kwargs
        )

    def combine_csvs(self, *args,**kwargs) -> pd.DataFrame:
        return self.tournament_submitter.combine_csvs(*args,**kwargs)

    def save_csv(self, *args, **kwargs):
        self.tournament_submitter.save_csv(*args, **kwargs)

    @property
    def get_model_mapping(self) -> dict:
        return self.tournament_submitter.api.get_models()

    def __call__(
            self,
            dataf: pd.DataFrame,
            model_name: str,
            numerbay_product_full_name: str,
            file_name: str = "submission.csv",
            cols: Union[str, list] = "prediction",
            *args,
            **kwargs,
    ):
        """
        The most common use case will be to create a CSV and submit it immediately after that.
        full_submission handles this.
        """
        self.full_submission(
            dataf=dataf,
            file_name=file_name,
            model_name=model_name,
            numerbay_product_full_name=numerbay_product_full_name,
            cols=cols,
            *args,
            **kwargs,
        )

__call__(dataf, model_name, numerbay_product_full_name, file_name='submission.csv', cols='prediction', *args, **kwargs)

The most common use case will be to create a CSV and submit it immediately after that. full_submission handles this.

Source code in numerblox/submission.py
def __call__(
        self,
        dataf: pd.DataFrame,
        model_name: str,
        numerbay_product_full_name: str,
        file_name: str = "submission.csv",
        cols: Union[str, list] = "prediction",
        *args,
        **kwargs,
):
    """
    The most common use case will be to create a CSV and submit it immediately after that.
    full_submission handles this.
    """
    self.full_submission(
        dataf=dataf,
        file_name=file_name,
        model_name=model_name,
        numerbay_product_full_name=numerbay_product_full_name,
        cols=cols,
        *args,
        **kwargs,
    )

full_submission(dataf, model_name, cols, numerbay_product_full_name, file_name='submission.csv', *args, **kwargs)

Save DataFrame to csv and upload predictions through API.

:param dataf: Main DataFrame containing cols.

:param model_name: Lowercase Numerai model name.

:param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'.

:param file_name: Path to save the submission file to, relative to the base directory.

:param cols: Columns to be saved in the submission file. 1 prediction column for Numerai Classic. At least 1 prediction column and 1 ticker column for Numerai Signals.

*args, **kwargs are passed to the numerapi API, for example the version argument in Numerai Classic submissions.

Source code in numerblox/submission.py
def full_submission(
    self,
    dataf: pd.DataFrame,
    model_name: str,
    cols: Union[str, list],
    numerbay_product_full_name: str,
    file_name: str = 'submission.csv',
    *args,
    **kwargs,
):
    """
    Save DataFrame to csv and upload predictions through API.

    :param dataf: Main DataFrame containing `cols`.
    :param model_name: Lowercase Numerai model name.
    :param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'
    :param file_name: path to save model to relative to base directory.
    :param cols: Columns to be saved in submission file.
    1 prediction column for Numerai Classic.
    At least 1 prediction column and 1 ticker column for Numerai Signals.
    *args, **kwargs are passed to numerapi API.
    For example `version` argument in Numerai Classic submissions.
    """
    self.save_csv(dataf=dataf, file_name=file_name, cols=cols)
    self.upload_predictions(
        file_name=file_name, model_name=model_name, numerbay_product_full_name=numerbay_product_full_name,
        *args, **kwargs
    )

upload_predictions(file_name, model_name, numerbay_product_full_name, *args, **kwargs)

Upload CSV file to NumerBay (and Numerai if 'upload_to_numerai' is True) for given model name and NumerBay product full name.

:param file_name: File name/path relative to directory_path.

:param model_name: Lowercase raw model name (For example, 'integration_test').

:param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'.

Source code in numerblox/submission.py
def upload_predictions(self,
                       file_name: str,
                       model_name: str,
                       numerbay_product_full_name: str,
                       *args,
                       **kwargs):
    """
    Upload CSV file to NumerBay (and Numerai if 'upload_to_numerai' is True) for given model name and NumerBay product full name.
    :param file_name: File name/path relative to directory_path.
    :param model_name: Lowercase raw model name (For example, 'integration_test').
    :param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'
    """
    if self.upload_to_numerai:
        self.tournament_submitter.upload_predictions(file_name, model_name, *args, **kwargs)

    full_path = str(self.dir / file_name)
    api_type = str(self.numerbay_api.__class__.__name__)
    print(
        f"{api_type}: Uploading predictions from '{full_path}' for NumerBay product '{numerbay_product_full_name}'"
    )
    artifact = self.numerbay_api.upload_artifact(
        str(full_path), product_full_name=numerbay_product_full_name
    )
    if artifact:
        print(
            f"{api_type} submission of '{full_path}' for NumerBay product [bold blue]{numerbay_product_full_name} is successful!"
        )
    else:
        print(f"""WARNING: Upload skipped for NumerBay product '{numerbay_product_full_name}', 
              the product uses buyer-side encryption but does not have any active sale order to upload for.""")
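
The following is a hedged usage sketch for NumerBaySubmitter wrapping a NumeraiClassicSubmitter (not part of the library source). The Key import path, credentials, model name and NumerBay product name are placeholders.

import pandas as pd
from numerblox.misc import Key  # assumed import path for Key
from numerblox.submission import NumeraiClassicSubmitter, NumerBaySubmitter

key = Key(pub_id="YOUR_PUBLIC_ID", secret_key="YOUR_SECRET_KEY")  # placeholder credentials
classic_submitter = NumeraiClassicSubmitter(directory_path="sub_dir", key=key)

nb_submitter = NumerBaySubmitter(
    tournament_submitter=classic_submitter,
    upload_to_numerai=True,              # also submit to Numerai
    numerbay_username="your_username",   # placeholder NumerBay credentials
    numerbay_password="your_password",
)

# Toy predictions indexed by Numerai 'id' with values in [0, 1].
predictions = pd.DataFrame({"prediction": [0.45, 0.55]}, index=pd.Index(["id_a", "id_b"], name="id"))

nb_submitter.full_submission(
    dataf=predictions,
    model_name="your_model",
    cols="prediction",
    numerbay_product_full_name="numerai-predictions-yourproduct",  # [category]-[product name]
)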

NumeraiClassicSubmitter

Bases: BaseSubmitter

Submit for Numerai Classic.

:param directory_path: Base directory to save and read prediction files from.

:param key: Key object containing valid credentials for Numerai Classic.

:param max_retries: Maximum number of retries for uploading predictions to Numerai.

:param sleep_time: Time to sleep between uploading retries.

:param fail_silently: Whether to skip uploading to Numerai without raising an error. Useful if you are uploading many models in a loop and want to skip models that fail to upload.

*args, **kwargs will be passed to NumerAPI initialization.

Source code in numerblox/submission.py
class NumeraiClassicSubmitter(BaseSubmitter):
    """
    Submit for Numerai Classic.

    :param directory_path: Base directory to save and read prediction files from. \n
    :param key: Key object containing valid credentials for Numerai Classic. \n
    :param max_retries: Maximum number of retries for uploading predictions to Numerai. 
    :param sleep_time: Time to sleep between uploading retries.
    :param fail_silently: Whether to skip uploading to Numerai without raising an error. 
    Useful for if you are uploading many models in a loop and want to skip models that fail to upload.
    *args, **kwargs will be passed to NumerAPI initialization.
    """
    def __init__(self, directory_path: str, key: Key, 
                 max_retries: int = 2, sleep_time: int = 10, 
                 fail_silently=False, *args, **kwargs):
        api = NumerAPI(public_id=key.pub_id, secret_key=key.secret_key, *args, **kwargs)
        super().__init__(
            directory_path=directory_path, api=api,
            max_retries=max_retries, sleep_time=sleep_time, 
            fail_silently=fail_silently
        )

    def save_csv(
            self,
            dataf: pd.DataFrame,
            file_name: str = "submission.csv",
            cols: str = 'prediction',
            *args,
            **kwargs,
    ):
        """
        :param dataf: DataFrame which should have at least the following columns:
        1. id (as index column)
        2. cols (for example, 'prediction_mymodel'). Will be saved in 'prediction' column
        :param file_name: .csv file path.
        :param cols: Prediction column name.
        For example, 'prediction' or 'prediction_mymodel'.
        """
        sub_dataf = deepcopy(dataf)
        self._check_value_range(dataf=sub_dataf, cols=cols)

        full_path = str(self.dir / file_name)
        print(
            f"Saving predictions CSV to '{full_path}'."
        )
        sub_dataf.loc[:, 'prediction'] = sub_dataf[cols]
        sub_dataf.loc[:, 'prediction'].to_csv(full_path, *args, **kwargs)

save_csv(dataf, file_name='submission.csv', cols='prediction', *args, **kwargs)

:param dataf: DataFrame which should have at least the following columns:

1. id (as index column)
2. cols (for example, 'prediction_mymodel'). Will be saved in 'prediction' column

:param file_name: .csv file path.

:param cols: Prediction column name. For example, 'prediction' or 'prediction_mymodel'.

Source code in numerblox/submission.py
def save_csv(
        self,
        dataf: pd.DataFrame,
        file_name: str = "submission.csv",
        cols: str = 'prediction',
        *args,
        **kwargs,
):
    """
    :param dataf: DataFrame which should have at least the following columns:
    1. id (as index column)
    2. cols (for example, 'prediction_mymodel'). Will be saved in 'prediction' column
    :param file_name: .csv file path.
    :param cols: Prediction column name.
    For example, 'prediction' or 'prediction_mymodel'.
    """
    sub_dataf = deepcopy(dataf)
    self._check_value_range(dataf=sub_dataf, cols=cols)

    full_path = str(self.dir / file_name)
    print(
        f"Saving predictions CSV to '{full_path}'."
    )
    sub_dataf.loc[:, 'prediction'] = sub_dataf[cols]
    sub_dataf.loc[:, 'prediction'].to_csv(full_path, *args, **kwargs)
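
A minimal end-to-end sketch for NumeraiClassicSubmitter; calling the submitter saves the CSV and uploads it in one step. The Key import path, credentials, directory, model name and the toy DataFrame are placeholders.

import pandas as pd
from numerblox.misc import Key  # assumed import path for Key
from numerblox.submission import NumeraiClassicSubmitter

key = Key(pub_id="YOUR_PUBLIC_ID", secret_key="YOUR_SECRET_KEY")  # placeholder credentials
submitter = NumeraiClassicSubmitter(directory_path="sub_dir", key=key)

# Toy DataFrame indexed by Numerai 'id' with predictions in the required (0, 1) range.
predictions = pd.DataFrame({"prediction": [0.45, 0.55]}, index=pd.Index(["id_a", "id_b"], name="id"))

# Equivalent to full_submission: saves 'submission.csv' in sub_dir and uploads it.
submitter(dataf=predictions, model_name="your_model", cols="prediction")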

NumeraiSignalsSubmitter

Bases: BaseSubmitter

Submit for Numerai Signals.

:param directory_path: Base directory to save and read prediction files from.

:param key: Key object containing valid credentials for Numerai Signals.

:param max_retries: Maximum number of retries for uploading predictions to Numerai.

:param sleep_time: Time to sleep between uploading retries.

:param fail_silently: Whether to skip uploading to Numerai without raising an error. Useful if you are uploading many models in a loop and want to skip models that fail to upload.

*args, **kwargs will be passed to SignalsAPI initialization.

Source code in numerblox/submission.py
class NumeraiSignalsSubmitter(BaseSubmitter):
    """
    Submit for Numerai Signals.

    :param directory_path: Base directory to save and read prediction files from. \n
    :param key: Key object containing valid credentials for Numerai Signals. \n
    :param max_retries: Maximum number of retries for uploading predictions to Numerai. 
    :param sleep_time: Time to sleep between uploading retries.
    :param fail_silently: Whether to skip uploading to Numerai without raising an error. 
    Useful for if you are uploading many models in a loop and want to skip models that fail to upload.
    *args, **kwargs will be passed to SignalsAPI initialization.
    """
    def __init__(self, directory_path: str, key: Key, 
                 max_retries: int = 2, sleep_time: int = 10, 
                 fail_silently=False, *args, **kwargs):
        api = SignalsAPI(
            public_id=key.pub_id, secret_key=key.secret_key, *args, **kwargs
        )
        super().__init__(
            directory_path=directory_path, api=api,
            max_retries=max_retries, sleep_time=sleep_time,
            fail_silently=fail_silently
        )
        self.supported_ticker_formats = [
            "cusip",
            "sedol",
            "ticker",
            "numerai_ticker",
            "bloomberg_ticker",
        ]

    def save_csv(
            self,
            dataf: pd.DataFrame,
            cols: list,
            file_name: str = "submission.csv",
            *args, **kwargs
    ):
        """
        :param dataf: DataFrame which should have at least the following columns:
         1. One of supported ticker formats (cusip, sedol, ticker, numerai_ticker or bloomberg_ticker)
         2. signal (Values between 0 and 1 (exclusive))
         Additional columns for if you include validation data (optional):
         3. date (YYYY-MM-DD format date indication)
         4. data_type ('val' and 'live' partitions)

         :param cols: All cols that are saved in CSV.
         cols should contain at least 1 ticker column and a 'signal' column.
         For example: ['bloomberg_ticker', 'signal']
         :param file_name: .csv file path.
        """
        self._check_ticker_format(cols=cols)
        self._check_value_range(dataf=dataf, cols="signal")

        full_path = str(self.dir / file_name)
        print(
            f"Saving Signals predictions CSV to '{full_path}'."
        )
        dataf.loc[:, cols].reset_index(drop=True).to_csv(
            full_path, index=False, *args, **kwargs
        )

    def _check_ticker_format(self, cols: list):
        """ Check for valid ticker format. """
        valid_tickers = set(cols).intersection(set(self.supported_ticker_formats))
        if not valid_tickers:
            raise NotImplementedError(
                f"No supported ticker format in {cols}). \
Supported: '{self.supported_ticker_formats}'"
            )

save_csv(dataf, cols, file_name='submission.csv', *args, **kwargs)

:param dataf: DataFrame which should have at least the following columns:

1. One of the supported ticker formats (cusip, sedol, ticker, numerai_ticker or bloomberg_ticker)
2. signal (values between 0 and 1, exclusive)

Additional columns if you include validation data (optional):

3. date (YYYY-MM-DD format date indication)
4. data_type ('val' and 'live' partitions)

:param cols: All cols that are saved in CSV. cols should contain at least 1 ticker column and a 'signal' column. For example: ['bloomberg_ticker', 'signal']

:param file_name: .csv file path.

Source code in numerblox/submission.py
def save_csv(
        self,
        dataf: pd.DataFrame,
        cols: list,
        file_name: str = "submission.csv",
        *args, **kwargs
):
    """
    :param dataf: DataFrame which should have at least the following columns:
     1. One of supported ticker formats (cusip, sedol, ticker, numerai_ticker or bloomberg_ticker)
     2. signal (Values between 0 and 1 (exclusive))
     Additional columns for if you include validation data (optional):
     3. date (YYYY-MM-DD format date indication)
     4. data_type ('val' and 'live' partitions)

     :param cols: All cols that are saved in CSV.
     cols should contain at least 1 ticker column and a 'signal' column.
     For example: ['bloomberg_ticker', 'signal']
     :param file_name: .csv file path.
    """
    self._check_ticker_format(cols=cols)
    self._check_value_range(dataf=dataf, cols="signal")

    full_path = str(self.dir / file_name)
    print(
        f"Saving Signals predictions CSV to '{full_path}'."
    )
    dataf.loc[:, cols].reset_index(drop=True).to_csv(
        full_path, index=False, *args, **kwargs
    )