Full API Reference

This section provides a detailed reference to all objects defined in NumerBlox.

BaseDownloader

Bases: BaseIO

Abstract base class for downloaders.

:param directory_path: Base folder to download files to.
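
Example

A minimal sketch of a custom downloader, assuming only what is documented here: subclasses implement download_training_data and download_live_data, and calling the instance triggers download_live_data. The class and folder names are hypothetical.

from numerblox.download import BaseDownloader

class MyDownloader(BaseDownloader):
    def download_training_data(self, *args, **kwargs):
        # Fetch full history and store it under self.dir (placeholder logic).
        ...

    def download_live_data(self, *args, **kwargs):
        # Fetch only the files needed for this week's inference.
        ...

downloader = MyDownloader("my_data")
downloader()  # Equivalent to downloader.download_live_data()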

Source code in numerblox/download.py
class BaseDownloader(BaseIO):
    """
    Abstract base class for downloaders.

    :param directory_path: Base folder to download files to.
    """
    def __init__(self, directory_path: str):
        super().__init__(directory_path=directory_path)

    @abstractmethod
    def download_training_data(self, *args, **kwargs):
        """ Download all necessary files needed for training. """
        ...

    @abstractmethod
    def download_live_data(self, *args, **kwargs):
        """ Download minimal amount of files needed for weekly inference. """
        ...

    @staticmethod
    def _load_json(file_path: str, verbose=False, *args, **kwargs) -> dict:
        """ Load JSON from file and return as dictionary. """
        with open(Path(file_path)) as json_file:
            json_data = json.load(json_file, *args, **kwargs)
        if verbose:
            print(json_data)
        return json_data

    def _default_save_path(self, start: dt, end: dt, backend: str):
        """ Save to downloader directory indicating backend, start date and end date as parquet file. """
        return f"{self.dir}/{backend}_{start.strftime('%Y%m%d')}_{end.strftime('%Y%m%d')}.parquet"

    def __call__(self, *args, **kwargs):
        """
        The most common use case will be to get weekly inference data. So calling the class itself returns inference data.
        """
        self.download_live_data(*args, **kwargs)

__call__(*args, **kwargs)

The most common use case is to get weekly inference data, so calling the object itself downloads live (inference) data.

Source code in numerblox/download.py
def __call__(self, *args, **kwargs):
    """
    The most common use case will be to get weekly inference data. So calling the class itself returns inference data.
    """
    self.download_live_data(*args, **kwargs)

download_live_data(*args, **kwargs) abstractmethod

Download minimal amount of files needed for weekly inference.

Source code in numerblox/download.py
@abstractmethod
def download_live_data(self, *args, **kwargs):
    """ Download minimal amount of files needed for weekly inference. """
    ...

download_training_data(*args, **kwargs) abstractmethod

Download all necessary files needed for training.

Source code in numerblox/download.py
@abstractmethod
def download_training_data(self, *args, **kwargs):
    """ Download all necessary files needed for training. """
    ...

BaseIO

Bases: ABC

Basic functionality for IO (downloading and uploading).

:param directory_path: Base folder for IO. Will be created if it does not exist.
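
Example

A minimal sketch of the GCS helpers, assuming Google Cloud credentials are already configured (e.g. via GOOGLE_APPLICATION_CREDENTIALS) and using NumeraiClassicDownloader (documented below) as the concrete BaseIO subclass. Bucket and file paths are placeholders.

from numerblox.download import NumeraiClassicDownloader

io = NumeraiClassicDownloader("data")
# Upload a single local file to a GCS bucket.
io.upload_file_to_gcs(
    bucket_name="my-bucket",
    gcs_path="numerai/train.parquet",
    local_path="data/train.parquet",
)
# Upload the whole base directory.
io.upload_directory_to_gcs(bucket_name="my-bucket", gcs_path="numerai")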

Source code in numerblox/download.py
class BaseIO(ABC):
    """
    Basic functionality for IO (downloading and uploading).

    :param directory_path: Base folder for IO. Will be created if it does not exist.
    """
    def __init__(self, directory_path: str):
        self.dir = Path(directory_path)
        self._create_directory()

    def remove_base_directory(self):
        """Remove directory with all contents."""
        abs_path = self.dir.resolve()
        print(
            f"WARNING: Deleting directory for '{self.__class__.__name__}'\nPath: '{abs_path}'"
        )
        shutil.rmtree(abs_path)

    def download_file_from_gcs(self, bucket_name: str, gcs_path: str):
        """
        Get file from GCS bucket and download to local directory.
        :param gcs_path: Path to file on GCS bucket.
        """
        blob_path = str(self.dir.resolve())
        blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=blob_path)
        blob.download_to_filename(gcs_path)
        print(
            f"Downloaded GCS object '{gcs_path}' from bucket '{blob.bucket.id}' to local directory '{blob_path}'."
        )

    def upload_file_to_gcs(self, bucket_name: str, gcs_path: str, local_path: str):
        """
        Upload file to some GCS bucket.
        :param gcs_path: Path to file on GCS bucket.
        """
        blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=gcs_path)
        blob.upload_from_filename(local_path)
        print(
            f"Local file '{local_path}' uploaded to '{gcs_path}' in bucket {blob.bucket.id}"
        )

    def download_directory_from_gcs(self, bucket_name: str, gcs_path: str):
        """
        Copy full directory from GCS bucket to local environment.
        :param gcs_path: Name of directory on GCS bucket.
        """
        blob_path = str(self.dir.resolve())
        blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=blob_path)
        for gcs_file in glob.glob(gcs_path + "/**", recursive=True):
            if os.path.isfile(gcs_file):
                blob.download_to_filename(blob_path)
        print(
            f"Directory '{gcs_path}' from bucket '{blob.bucket.id}' downloaded to '{blob_path}'"
        )

    def upload_directory_to_gcs(self, bucket_name: str, gcs_path: str):
        """
        Upload full base directory to GCS bucket.
        :param gcs_path: Name of directory on GCS bucket.
        """
        blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=gcs_path)
        for local_path in glob.glob(str(self.dir) + "/**", recursive=True):
            if os.path.isfile(local_path):
                blob.upload_from_filename(local_path)
        print(
            f"Directory '{self.dir}' uploaded to '{gcs_path}' in bucket {blob.bucket.id}"
        )

    def _get_gcs_blob(self, bucket_name: str, blob_path: str) -> storage.Blob:
        """ Create blob that interacts with Google Cloud Storage (GCS). """
        client = storage.Client()
        # https://console.cloud.google.com/storage/browser/[bucket_name]
        bucket = client.get_bucket(bucket_name)
        blob = bucket.blob(blob_path)
        return blob

    def _append_folder(self, folder: str) -> Path:
        """
        Return base directory Path object appended with 'folder'.
        Create directory if it does not exist.
        """
        dir = Path(self.dir / folder)
        dir.mkdir(parents=True, exist_ok=True)
        return dir

    def _get_dest_path(self, subfolder: str, filename: str) -> str:
        """ Prepare destination path for downloading. """
        dir = self._append_folder(subfolder)
        dest_path = str(dir.joinpath(filename.split("/")[-1]))
        return dest_path

    def _create_directory(self):
        """ Create base directory if it does not exist. """
        if not self.dir.is_dir():
            print(
                f"No existing directory found at '{self.dir}'. Creating directory..."
            )
            self.dir.mkdir(parents=True, exist_ok=True)

    @property
    def get_all_files(self) -> list:
        """ Return all paths of contents in directory. """
        return list(self.dir.iterdir())

    @property
    def is_empty(self) -> bool:
        """ Check if directory is empty. """
        return not bool(self.get_all_files)

get_all_files: list property

Return all paths of contents in directory.

is_empty: bool property

Check if directory is empty.

download_directory_from_gcs(bucket_name, gcs_path)

Copy full directory from GCS bucket to local environment.

:param gcs_path: Name of directory on GCS bucket.

Source code in numerblox/download.py
def download_directory_from_gcs(self, bucket_name: str, gcs_path: str):
    """
    Copy full directory from GCS bucket to local environment.
    :param gcs_path: Name of directory on GCS bucket.
    """
    blob_path = str(self.dir.resolve())
    blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=blob_path)
    for gcs_file in glob.glob(gcs_path + "/**", recursive=True):
        if os.path.isfile(gcs_file):
            blob.download_to_filename(blob_path)
    print(
        f"Directory '{gcs_path}' from bucket '{blob.bucket.id}' downloaded to '{blob_path}'"
    )

download_file_from_gcs(bucket_name, gcs_path)

Get file from GCS bucket and download to local directory.

:param gcs_path: Path to file on GCS bucket.

Source code in numerblox/download.py
def download_file_from_gcs(self, bucket_name: str, gcs_path: str):
    """
    Get file from GCS bucket and download to local directory.
    :param gcs_path: Path to file on GCS bucket.
    """
    blob_path = str(self.dir.resolve())
    blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=blob_path)
    blob.download_to_filename(gcs_path)
    print(
        f"Downloaded GCS object '{gcs_path}' from bucket '{blob.bucket.id}' to local directory '{blob_path}'."
    )

remove_base_directory()

Remove directory with all contents.

Source code in numerblox/download.py
def remove_base_directory(self):
    """Remove directory with all contents."""
    abs_path = self.dir.resolve()
    print(
        f"WARNING: Deleting directory for '{self.__class__.__name__}'\nPath: '{abs_path}'"
    )
    shutil.rmtree(abs_path)

upload_directory_to_gcs(bucket_name, gcs_path)

Upload full base directory to GCS bucket.

:param gcs_path: Name of directory on GCS bucket.

Source code in numerblox/download.py
def upload_directory_to_gcs(self, bucket_name: str, gcs_path: str):
    """
    Upload full base directory to GCS bucket.
    :param gcs_path: Name of directory on GCS bucket.
    """
    blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=gcs_path)
    for local_path in glob.glob(str(self.dir) + "/**", recursive=True):
        if os.path.isfile(local_path):
            blob.upload_from_filename(local_path)
    print(
        f"Directory '{self.dir}' uploaded to '{gcs_path}' in bucket {blob.bucket.id}"
    )

upload_file_to_gcs(bucket_name, gcs_path, local_path)

Upload file to some GCS bucket.

:param gcs_path: Path to file on GCS bucket.

Source code in numerblox/download.py
def upload_file_to_gcs(self, bucket_name: str, gcs_path: str, local_path: str):
    """
    Upload file to some GCS bucket.
    :param gcs_path: Path to file on GCS bucket.
    """
    blob = self._get_gcs_blob(bucket_name=bucket_name, blob_path=gcs_path)
    blob.upload_from_filename(local_path)
    print(
        f"Local file '{local_path}' uploaded to '{gcs_path}' in bucket {blob.bucket.id}"
    )

EODDownloader

Bases: BaseDownloader

Download data from EOD Historical Data.

More info: https://eodhistoricaldata.com/

Make sure you have the underlying Python package installed. pip install eod.

:param directory_path: Base folder to download files to.

:param key: Valid EOD client key.

:param tickers: List of valid EOD tickers (Bloomberg ticker format).

:param frequency: Choose from [d, w, m]. Daily data by default.
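
Example

A minimal usage sketch. The API key and tickers are placeholders; pip install eod is required.

from numerblox.download import EODDownloader

eod = EODDownloader(
    directory_path="eod_data",
    key="YOUR_EOD_KEY",
    tickers=["AAPL.US", "MSFT.US"],
    frequency="d",
)
eod.download_live_data()                        # Last year of prices -> parquet in eod_data/
eod.download_training_data(start="2010-01-01")  # Full history from the start date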

Source code in numerblox/download.py
class EODDownloader(BaseDownloader):
    """
    Download data from EOD historical data. \n
    More info: https://eodhistoricaldata.com/

    Make sure you have the underlying Python package installed.
    `pip install eod`.

    :param directory_path: Base folder to download files to. \n
    :param key: Valid EOD client key. \n
    :param tickers: List of valid EOD tickers (Bloomberg ticker format). \n
    :param frequency: Choose from [d, w, m]. \n
    Daily data by default.
    """
    def __init__(self,
                 directory_path: str,
                 key: str,
                 tickers: list,
                 frequency: str = "d"):
        super().__init__(directory_path=directory_path)
        self.key = key
        self.tickers = tickers
        try: 
            from eod import EodHistoricalData
        except ImportError:
            raise ImportError("Could not import eod package. Please install eod package with 'pip install eod'")
        self.client = EodHistoricalData(self.key)
        self.frequency = frequency
        self.current_time = dt.now()
        self.end_date = self.current_time.strftime("%Y-%m-%d")
        self.cpu_count = os.cpu_count()
        # Time to sleep in between API calls to avoid hitting EOD rate limits.
        # EOD rate limit is set at 1000 calls per minute.
        self.sleep_time = self.cpu_count / 32

    def download_live_data(self):
        """ Download one year of data for defined tickers. """
        start = (pd.Timestamp(self.current_time) - relativedelta(years=1)).strftime("%Y-%m-%d")
        dataf = self.get_numerframe_data(start=start)
        dataf.to_parquet(self._default_save_path(start=pd.Timestamp(start),
                                                 end=pd.Timestamp(self.end_date),
                                                 backend="eod"))

    def download_training_data(self, start: str = None):
        """
        Download full date length available.
        start: Starting data in %Y-%m-%d format.
        """
        start = start if start else "1970-01-01"
        dataf = self.generate_full_dataf(start=start)
        dataf.to_parquet(self._default_save_path(start=pd.Timestamp(start),
                                                 end=pd.Timestamp(self.end_date),
                                                 backend="eod"))

    def get_numerframe_data(self, start: str) -> NumerFrame:
        """
        Get NumerFrame data from some starting date.
        start: Starting data in %Y-%m-%d format.
        """
        dataf = self.generate_full_dataf(start=start)
        return NumerFrame(dataf)

    def generate_full_dataf(self, start: str) -> pd.DataFrame:
        """
        Collect all price data for list of EOD ticker symbols (Bloomberg tickers).
        start: Starting data in %Y-%m-%d format.
        """
        price_datafs = []
        with ThreadPoolExecutor(max_workers=self.cpu_count) as executor:
            tasks = [executor.submit(self.generate_stock_dataf, ticker, start) for ticker in self.tickers]
            for task in tqdm(concurrent.futures.as_completed(tasks),
                             total=len(self.tickers),
                             desc="EOD price data extraction"):
                price_datafs.append(task.result())
        return pd.concat(price_datafs)

    def generate_stock_dataf(self, ticker: str, start: str) -> pd.DataFrame:
        """
        Generate Price DataFrame for a single ticker.
        ticker: EOD ticker symbol (Bloomberg tickers).
        For example, Apple stock = AAPL.US.
        start: Starting data in %Y-%m-%d format.
        """
        time.sleep(self.sleep_time)
        try:
            resp = self.client.get_prices_eod(ticker, period=self.frequency,
                                              from_=start, to=self.end_date)
            stock_df = pd.DataFrame(resp).set_index('date')
            stock_df['ticker'] = ticker
        except Exception as e:
            print(f"WARNING: Date pull failed on ticker: '{ticker}'. Exception: {e}")
            stock_df = pd.DataFrame()
        return stock_df

download_live_data()

Download one year of data for defined tickers.

Source code in numerblox/download.py
def download_live_data(self):
    """ Download one year of data for defined tickers. """
    start = (pd.Timestamp(self.current_time) - relativedelta(years=1)).strftime("%Y-%m-%d")
    dataf = self.get_numerframe_data(start=start)
    dataf.to_parquet(self._default_save_path(start=pd.Timestamp(start),
                                             end=pd.Timestamp(self.end_date),
                                             backend="eod"))

download_training_data(start=None)

Download the full available date range.

start: Starting date in %Y-%m-%d format.

Source code in numerblox/download.py
def download_training_data(self, start: str = None):
    """
    Download full date length available.
    start: Starting data in %Y-%m-%d format.
    """
    start = start if start else "1970-01-01"
    dataf = self.generate_full_dataf(start=start)
    dataf.to_parquet(self._default_save_path(start=pd.Timestamp(start),
                                             end=pd.Timestamp(self.end_date),
                                             backend="eod"))

generate_full_dataf(start)

Collect all price data for the list of EOD ticker symbols (Bloomberg tickers).

start: Starting date in %Y-%m-%d format.

Source code in numerblox/download.py
def generate_full_dataf(self, start: str) -> pd.DataFrame:
    """
    Collect all price data for list of EOD ticker symbols (Bloomberg tickers).
    start: Starting data in %Y-%m-%d format.
    """
    price_datafs = []
    with ThreadPoolExecutor(max_workers=self.cpu_count) as executor:
        tasks = [executor.submit(self.generate_stock_dataf, ticker, start) for ticker in self.tickers]
        for task in tqdm(concurrent.futures.as_completed(tasks),
                         total=len(self.tickers),
                         desc="EOD price data extraction"):
            price_datafs.append(task.result())
    return pd.concat(price_datafs)

generate_stock_dataf(ticker, start)

Generate price DataFrame for a single ticker.

ticker: EOD ticker symbol (Bloomberg format). For example, Apple stock = AAPL.US.

start: Starting date in %Y-%m-%d format.

Source code in numerblox/download.py
def generate_stock_dataf(self, ticker: str, start: str) -> pd.DataFrame:
    """
    Generate Price DataFrame for a single ticker.
    ticker: EOD ticker symbol (Bloomberg tickers).
    For example, Apple stock = AAPL.US.
    start: Starting data in %Y-%m-%d format.
    """
    time.sleep(self.sleep_time)
    try:
        resp = self.client.get_prices_eod(ticker, period=self.frequency,
                                          from_=start, to=self.end_date)
        stock_df = pd.DataFrame(resp).set_index('date')
        stock_df['ticker'] = ticker
    except Exception as e:
        print(f"WARNING: Date pull failed on ticker: '{ticker}'. Exception: {e}")
        stock_df = pd.DataFrame()
    return stock_df

get_numerframe_data(start)

Get NumerFrame data from some starting date.

start: Starting date in %Y-%m-%d format.

Source code in numerblox/download.py
def get_numerframe_data(self, start: str) -> NumerFrame:
    """
    Get NumerFrame data from some starting date.
    start: Starting data in %Y-%m-%d format.
    """
    dataf = self.generate_full_dataf(start=start)
    return NumerFrame(dataf)

KaggleDownloader

Bases: BaseDownloader

Download financial data from Kaggle.

For authentication, make sure you have a directory called .kaggle in your home directory containing a kaggle.json file. kaggle.json should have the following structure:

{"username": USERNAME, "key": KAGGLE_API_KEY}

More info on authentication: github.com/Kaggle/kaggle-api#api-credentials

More info on the Kaggle Python API: kaggle.com/donkeys/kaggle-python-api

:param directory_path: Base folder to download files to.
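
Example

A minimal usage sketch, assuming valid Kaggle credentials are set up. The dataset slug is a placeholder.

from numerblox.download import KaggleDownloader

kd = KaggleDownloader("kaggle_data")
kd.download_training_data("some-user/some-financial-dataset")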

Source code in numerblox/download.py
class KaggleDownloader(BaseDownloader):
    """
    Download financial data from Kaggle.

    For authentication, make sure you have a directory called .kaggle in your home directory
    with therein a kaggle.json file. kaggle.json should have the following structure: \n
    `{"username": USERNAME, "key": KAGGLE_API_KEY}` \n
    More info on authentication: github.com/Kaggle/kaggle-api#api-credentials \n

    More info on the Kaggle Python API: kaggle.com/donkeys/kaggle-python-api \n

    :param directory_path: Base folder to download files to.
    """
    def __init__(self, directory_path: str):
        self.__check_kaggle_import()
        super().__init__(directory_path=directory_path)

    def download_live_data(self, kaggle_dataset_path: str):
        """
        Download arbitrary Kaggle dataset.
        :param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/)
        """
        self.download_training_data(kaggle_dataset_path)

    def download_training_data(self, kaggle_dataset_path: str):
        """
        Download arbitrary Kaggle dataset.
        :param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/)
        """
        import kaggle
        kaggle.api.dataset_download_files(kaggle_dataset_path,
                                          path=self.dir, unzip=True)

    @staticmethod
    def __check_kaggle_import():
        try:
            import kaggle
        except OSError:
            raise OSError("Could not find kaggle.json credentials. Make sure it's located in /home/runner/.kaggle. Or use the environment method. Check github.com/Kaggle/kaggle-api#api-credentials for more information on authentication.")

download_live_data(kaggle_dataset_path)

Download arbitrary Kaggle dataset.

:param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/).

Source code in numerblox/download.py
def download_live_data(self, kaggle_dataset_path: str):
    """
    Download arbitrary Kaggle dataset.
    :param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/)
    """
    self.download_training_data(kaggle_dataset_path)

download_training_data(kaggle_dataset_path)

Download arbitrary Kaggle dataset.

:param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/).

Source code in numerblox/download.py
def download_training_data(self, kaggle_dataset_path: str):
    """
    Download arbitrary Kaggle dataset.
    :param kaggle_dataset_path: Path on Kaggle (URL slug on kaggle.com/)
    """
    import kaggle
    kaggle.api.dataset_download_files(kaggle_dataset_path,
                                      path=self.dir, unzip=True)

NumeraiClassicDownloader

Bases: BaseDownloader

WARNING: Versions 1-3 (legacy data) are deprecated. Only versions 4+ are supported.

Downloading from NumerAPI for Numerai Classic data.

:param directory_path: Base folder to download files to.

All kwargs will be passed to NumerAPI initialization.
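
Example

A minimal usage sketch based on the methods documented below. Folder names are placeholders.

from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader("numerai_data")
# Training + validation parquet files for a given dataset version.
dl.download_training_data("train_val", version="4.3")
# Minimal live data for weekly inference (latest round by default).
dl.download_live_data("live", version="4.3")
# Clean up the base directory when done.
dl.remove_base_directory()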

Source code in numerblox/download.py
class NumeraiClassicDownloader(BaseDownloader):
    """
    WARNING: Versions 1-3 (legacy data) are deprecated. Only supporting version 4+.

    Downloading from NumerAPI for Numerai Classic data. \n
    :param directory_path: Base folder to download files to. \n
    All kwargs will be passed to NumerAPI initialization.
    """
    TRAIN_DATASET_NAME = "train_int8.parquet"
    VALIDATION_DATASET_NAME = "validation_int8.parquet"
    LIVE_DATASET_NAME = "live_int8.parquet"
    LIVE_EXAMPLE_PREDS_NAME = "live_example_preds.parquet"
    VALIDATION_EXAMPLE_PREDS_NAME = "validation_example_preds.parquet"

    def __init__(self, directory_path: str, **kwargs):
        super().__init__(directory_path=directory_path)
        self.napi = NumerAPI(**kwargs)
        self.current_round = self.napi.get_current_round()
        # Get all available versions available for Numerai.
        self.dataset_versions = set(s.split("/")[0] for s in self.napi.list_datasets())
        self.dataset_versions.discard("signals")

    def download_training_data(
        self, subfolder: str = "", version: str = "4.3"
    ):
        """
        Get Numerai classic training and validation data.
        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param version: Numerai dataset version.
        4 = April 2022 dataset
        4.1 = Sunshine dataset
        4.2 (default) = Rain Dataset
        4.3 = Midnight dataset
        """
        self._check_dataset_version(version)
        train_val_files = [f"v{version}/{self.TRAIN_DATASET_NAME}",
                           f"v{version}/{self.VALIDATION_DATASET_NAME}"]
        for file in train_val_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path
            )

    def download_single_dataset(
        self, filename: str, dest_path: str, round_num: int = None
    ):
        """
        Download one of the available datasets through NumerAPI.

        :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets() for full overview)
        :param dest_path: Full path where file will be saved.
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        """
        print(
            f"Downloading '{filename}'."
        )
        self.napi.download_dataset(
            filename=filename,
            dest_path=dest_path,
            round_num=round_num
        )

    def download_live_data(
            self,
            subfolder: str = "",
            version: str = "4.3",
            round_num: int = None
    ):
        """
        Download all live data in specified folder for given version (i.e. minimal data needed for inference).

        :param subfolder: Specify folder to create folder within directory root.
        Saves in directory root by default.
        :param version: Numerai dataset version. 
        4 = April 2022 dataset
        4.1 = Sunshine dataset
        4.2 (default) = Rain Dataset
        4.3 = Midnight dataset
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        """
        self._check_dataset_version(version)
        live_files = [f"v{version}/{self.LIVE_DATASET_NAME}"]
        for file in live_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path,
                round_num=round_num
            )

    def download_example_data(
        self, subfolder: str = "", version: str = "4.3", round_num: int = None
    ):
        """
        Download all example prediction data in specified folder for given version.

        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param version: Numerai dataset version.
        4 = April 2022 dataset
        4.1 = Sunshine dataset
        4.2 (default) = Rain Dataset
        4.3 = Midnight dataset
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        """
        self._check_dataset_version(version)
        example_files = [f"v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}", 
                         f"v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
        for file in example_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path,
                round_num=round_num
            )

    def get_classic_features(self, subfolder: str = "", filename="v4.3/features.json", *args, **kwargs) -> dict:
        """
        Download feature overview (stats and feature sets) through NumerAPI and load as dict.
        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param filename: name for feature overview.
        *args, **kwargs will be passed to the JSON loader.
        :return: Feature overview dict
        """
        version = filename.split("/")[0].replace("v", "")
        self._check_dataset_version(version)
        dest_path = self._get_dest_path(subfolder, filename)
        self.download_single_dataset(filename=filename,
                                     dest_path=dest_path)
        json_data = self._load_json(dest_path, *args, **kwargs)
        return json_data

    def download_meta_model_preds(self, subfolder: str = "", filename="v4.3/meta_model.parquet") -> pd.DataFrame:
        """
        Download Meta model predictions through NumerAPI.
        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param filename: name for meta model predictions file.
        :return: Meta model predictions as DataFrame.
        """
        version = filename.split("/")[0].replace("v", "")
        self._check_dataset_version(version)
        dest_path = self._get_dest_path(subfolder, filename)
        self.download_single_dataset(
            filename=filename,
            dest_path=dest_path,
            )
        return pd.read_parquet(dest_path)

    def _check_dataset_version(self, version: str):
        assert f"v{version}" in self.dataset_versions, f"Version '{version}' is not available in NumerAPI."

download_example_data(subfolder='', version='4.3', round_num=None)

Download all example prediction data in specified folder for given version.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param version: Numerai dataset version. 4 = April 2022 dataset, 4.1 = Sunshine dataset, 4.2 = Rain dataset, 4.3 (default) = Midnight dataset.

:param round_num: Numerai tournament round number. Downloads latest round by default.

Source code in numerblox/download.py
def download_example_data(
    self, subfolder: str = "", version: str = "4.3", round_num: int = None
):
    """
    Download all example prediction data in specified folder for given version.

    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param version: Numerai dataset version.
    4 = April 2022 dataset
    4.1 = Sunshine dataset
    4.2 (default) = Rain Dataset
    4.3 = Midnight dataset
    :param round_num: Numerai tournament round number. Downloads latest round by default.
    """
    self._check_dataset_version(version)
    example_files = [f"v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}", 
                     f"v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
    for file in example_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path,
            round_num=round_num
        )

download_live_data(subfolder='', version='4.3', round_num=None)

Download all live data in specified folder for given version (i.e. minimal data needed for inference).

:param subfolder: Specify folder to create within the directory root. Saves in the directory root by default.

:param version: Numerai dataset version. 4 = April 2022 dataset, 4.1 = Sunshine dataset, 4.2 = Rain dataset, 4.3 (default) = Midnight dataset.

:param round_num: Numerai tournament round number. Downloads latest round by default.

Source code in numerblox/download.py
def download_live_data(
        self,
        subfolder: str = "",
        version: str = "4.3",
        round_num: int = None
):
    """
    Download all live data in specified folder for given version (i.e. minimal data needed for inference).

    :param subfolder: Specify folder to create folder within directory root.
    Saves in directory root by default.
    :param version: Numerai dataset version. 
    4 = April 2022 dataset
    4.1 = Sunshine dataset
    4.2 (default) = Rain Dataset
    4.3 = Midnight dataset
    :param round_num: Numerai tournament round number. Downloads latest round by default.
    """
    self._check_dataset_version(version)
    live_files = [f"v{version}/{self.LIVE_DATASET_NAME}"]
    for file in live_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path,
            round_num=round_num
        )

download_meta_model_preds(subfolder='', filename='v4.3/meta_model.parquet')

Download Meta model predictions through NumerAPI.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param filename: Name for the meta model predictions file.

:return: Meta model predictions as DataFrame.
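
A short sketch of fetching the meta model predictions as a DataFrame (the directory name is a placeholder):

from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader("numerai_data")
meta_model_preds = dl.download_meta_model_preds()
print(meta_model_preds.head())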

Source code in numerblox/download.py
def download_meta_model_preds(self, subfolder: str = "", filename="v4.3/meta_model.parquet") -> pd.DataFrame:
    """
    Download Meta model predictions through NumerAPI.
    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param filename: name for meta model predictions file.
    :return: Meta model predictions as DataFrame.
    """
    version = filename.split("/")[0].replace("v", "")
    self._check_dataset_version(version)
    dest_path = self._get_dest_path(subfolder, filename)
    self.download_single_dataset(
        filename=filename,
        dest_path=dest_path,
        )
    return pd.read_parquet(dest_path)

download_single_dataset(filename, dest_path, round_num=None)

Download one of the available datasets through NumerAPI.

:param filename: Name as listed in NumerAPI (check NumerAPI().list_datasets() for a full overview).

:param dest_path: Full path where file will be saved.

:param round_num: Numerai tournament round number. Downloads latest round by default.
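
A short sketch of downloading one specific dataset file; the destination path is a placeholder:

from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader("numerai_data")
dl.download_single_dataset(
    filename="v4.3/live_int8.parquet",
    dest_path="numerai_data/live_int8.parquet",
)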

Source code in numerblox/download.py
def download_single_dataset(
    self, filename: str, dest_path: str, round_num: int = None
):
    """
    Download one of the available datasets through NumerAPI.

    :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets() for full overview)
    :param dest_path: Full path where file will be saved.
    :param round_num: Numerai tournament round number. Downloads latest round by default.
    """
    print(
        f"Downloading '{filename}'."
    )
    self.napi.download_dataset(
        filename=filename,
        dest_path=dest_path,
        round_num=round_num
    )

download_training_data(subfolder='', version='4.3')

Get Numerai Classic training and validation data.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param version: Numerai dataset version. 4 = April 2022 dataset, 4.1 = Sunshine dataset, 4.2 = Rain dataset, 4.3 (default) = Midnight dataset.

Source code in numerblox/download.py
def download_training_data(
    self, subfolder: str = "", version: str = "4.3"
):
    """
    Get Numerai classic training and validation data.
    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param version: Numerai dataset version.
    4 = April 2022 dataset
    4.1 = Sunshine dataset
    4.2 (default) = Rain Dataset
    4.3 = Midnight dataset
    """
    self._check_dataset_version(version)
    train_val_files = [f"v{version}/{self.TRAIN_DATASET_NAME}",
                       f"v{version}/{self.VALIDATION_DATASET_NAME}"]
    for file in train_val_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path
        )

get_classic_features(subfolder='', filename='v4.3/features.json', *args, **kwargs)

Download feature overview (stats and feature sets) through NumerAPI and load as dict.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param filename: Name for the feature overview file.

*args, **kwargs will be passed to the JSON loader.

:return: Feature overview dict.
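
A short sketch of loading the feature overview and selecting a predefined feature set. The "feature_sets" and "medium" keys reflect the usual layout of features.json and are assumptions here.

from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader("numerai_data")
feature_overview = dl.get_classic_features(filename="v4.3/features.json")
medium_features = feature_overview["feature_sets"]["medium"]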

Source code in numerblox/download.py
def get_classic_features(self, subfolder: str = "", filename="v4.3/features.json", *args, **kwargs) -> dict:
    """
    Download feature overview (stats and feature sets) through NumerAPI and load as dict.
    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param filename: name for feature overview.
    *args, **kwargs will be passed to the JSON loader.
    :return: Feature overview dict
    """
    version = filename.split("/")[0].replace("v", "")
    self._check_dataset_version(version)
    dest_path = self._get_dest_path(subfolder, filename)
    self.download_single_dataset(filename=filename,
                                 dest_path=dest_path)
    json_data = self._load_json(dest_path, *args, **kwargs)
    return json_data

NumeraiSignalsDownloader

Bases: BaseDownloader

Support for Numerai Signals v1 parquet data.

Downloading from SignalsAPI for Numerai Signals data.

:param directory_path: Base folder to download files to.

All kwargs will be passed to SignalsAPI initialization.
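
Example

A minimal usage sketch mirroring the classic downloader. Folder names are placeholders.

from numerblox.download import NumeraiSignalsDownloader

sdl = NumeraiSignalsDownloader("signals_data")
sdl.download_training_data("train_val", version="1.0")
sdl.download_live_data("live", version="1.0")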

Source code in numerblox/download.py
class NumeraiSignalsDownloader(BaseDownloader):
    """
    Support for Numerai Signals v1 parquet data.

    Downloading from SignalsAPI for Numerai Signals data. \n
    :param directory_path: Base folder to download files to. \n
    All kwargs will be passed to SignalsAPI initialization.
    """
    TRAIN_DATASET_NAME = "train.parquet"
    VALIDATION_DATASET_NAME = "validation.parquet"
    LIVE_DATASET_NAME = "live.parquet"
    LIVE_EXAMPLE_PREDS_NAME = "live_example_preds.parquet"
    VALIDATION_EXAMPLE_PREDS_NAME = "validation_example_preds.parquet"

    def __init__(self, directory_path: str, **kwargs):
        super().__init__(directory_path=directory_path)
        self.sapi = SignalsAPI(**kwargs)
        self.current_round = self.sapi.get_current_round()
        # Get all available versions available for Numerai Signals.
        self.dataset_versions = set(s.replace("signals/", "").split("/")[0] for s in self.sapi.list_datasets() if s.startswith("signals/v"))

    def download_training_data(
        self, subfolder: str = "", version: str = "1.0"
    ):
        """
        Get Numerai Signals training and validation data.
        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param version: Numerai Signals dataset version.
        Currently only v1.0 is supported.
        """
        self._check_dataset_version(version)
        train_val_files = [f"signals/v{version}/{self.TRAIN_DATASET_NAME}",
                           f"signals/v{version}/{self.VALIDATION_DATASET_NAME}"]
        for file in train_val_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path
            )

    def download_single_dataset(
        self, filename: str, dest_path: str
    ):
        """
        Download one of the available datasets through SignalsAPI.

        :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets() for full overview)
        :param dest_path: Full path where file will be saved.
        """
        print(
            f"Downloading '{filename}'."
        )
        self.sapi.download_dataset(
            filename=filename,
            dest_path=dest_path,
        )

    def download_live_data(
            self,
            subfolder: str = "",
            version: str = "1.0",
    ):
        """
        Download all live data in specified folder (i.e. minimal data needed for inference).

        :param subfolder: Specify folder to create folder within directory root.
        Saves in directory root by default.
        :param version: Numerai dataset version. 
        Currently only v1.0 is supported.
        """
        self._check_dataset_version(version)
        live_files = [f"signals/v{version}/{self.LIVE_DATASET_NAME}"]
        for file in live_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path,
            )

    def download_example_data(
        self, subfolder: str = "", version: str = "1.0"
    ):
        """
        Download all example prediction data in specified folder for given version.

        :param subfolder: Specify folder to create folder within base directory root.
        Saves in base directory root by default.
        :param version: Numerai dataset version.
        Currently only v1.0 is supported.
        """
        self._check_dataset_version(version)
        example_files = [f"signals/v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}", 
                         f"signals/v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
        for file in example_files:
            dest_path = self._get_dest_path(subfolder, file)
            self.download_single_dataset(
                filename=file,
                dest_path=dest_path,
            )

    def _check_dataset_version(self, version: str):
        assert f"v{version}" in self.dataset_versions, f"Version '{version}' is not available in SignalsAPI."

download_example_data(subfolder='', version='1.0')

Download all example prediction data in specified folder for given version.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param version: Numerai dataset version. Currently only v1.0 is supported.

Source code in numerblox/download.py
def download_example_data(
    self, subfolder: str = "", version: str = "1.0"
):
    """
    Download all example prediction data in specified folder for given version.

    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param version: Numerai dataset version.
    Currently only v1.0 is supported.
    """
    self._check_dataset_version(version)
    example_files = [f"signals/v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}", 
                     f"signals/v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
    for file in example_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path,
        )

download_live_data(subfolder='', version='1.0')

Download all live data in specified folder (i.e. minimal data needed for inference).

:param subfolder: Specify folder to create within the directory root. Saves in the directory root by default.

:param version: Numerai dataset version. Currently only v1.0 is supported.

Source code in numerblox/download.py
def download_live_data(
        self,
        subfolder: str = "",
        version: str = "1.0",
):
    """
    Download all live data in specified folder (i.e. minimal data needed for inference).

    :param subfolder: Specify folder to create folder within directory root.
    Saves in directory root by default.
    :param version: Numerai dataset version. 
    Currently only v1.0 is supported.
    """
    self._check_dataset_version(version)
    live_files = [f"signals/v{version}/{self.LIVE_DATASET_NAME}"]
    for file in live_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path,
        )

download_single_dataset(filename, dest_path)

Download one of the available datasets through SignalsAPI.

:param filename: Name as listed in NumerAPI (check NumerAPI().list_datasets() for a full overview).

:param dest_path: Full path where file will be saved.

Source code in numerblox/download.py
def download_single_dataset(
    self, filename: str, dest_path: str
):
    """
    Download one of the available datasets through SignalsAPI.

    :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets() for full overview)
    :param dest_path: Full path where file will be saved.
    """
    print(
        f"Downloading '{filename}'."
    )
    self.sapi.download_dataset(
        filename=filename,
        dest_path=dest_path,
    )

download_training_data(subfolder='', version='1.0')

Get Numerai Signals training and validation data.

:param subfolder: Specify folder to create within the base directory root. Saves in the base directory root by default.

:param version: Numerai Signals dataset version. Currently only v1.0 is supported.

Source code in numerblox/download.py
def download_training_data(
    self, subfolder: str = "", version: str = "1.0"
):
    """
    Get Numerai Signals training and validation data.
    :param subfolder: Specify folder to create folder within base directory root.
    Saves in base directory root by default.
    :param version: Numerai Signals dataset version.
    Currently only v1.0 is supported.
    """
    self._check_dataset_version(version)
    train_val_files = [f"signals/v{version}/{self.TRAIN_DATASET_NAME}",
                       f"signals/v{version}/{self.VALIDATION_DATASET_NAME}"]
    for file in train_val_files:
        dest_path = self._get_dest_path(subfolder, file)
        self.download_single_dataset(
            filename=file,
            dest_path=dest_path
        )

NumerFrame

Bases: DataFrame

Data structure which extends Pandas DataFrames and allows for additional Numerai-specific functionality.
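
Example

A minimal usage sketch, assuming a Numerai-style parquet file with 'era', 'feature_...' and 'target...' columns. The file path is a placeholder.

import pandas as pd
from numerblox.numerframe import NumerFrame

df = pd.read_parquet("numerai_data/train_int8.parquet")
nf = NumerFrame(df)

X, y = nf.get_feature_target_pair(multi_target=False)  # Features vs. main target
recent = nf.get_last_n_eras(10)                         # Last 10 eras
fncv3_features = nf.get_fncv3_feature_data              # FNCv3 feature subset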

Source code in numerblox/numerframe.py
class NumerFrame(pd.DataFrame):
    """
    Data structure which extends Pandas DataFrames and
    allows for additional Numerai specific functionality.
    """
    _metadata = ["meta", "feature_cols", "target_cols",
                 "prediction_cols", "not_aux_cols", "aux_cols"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.meta = AttrDict()
        self.__set_era_col()
        self.__init_meta_attrs()

    @property
    def _constructor(self):
        return NumerFrame

    def __init_meta_attrs(self):
        """ Dynamically track column groups. """
        self.feature_cols = [col for col in self.columns if str(col).startswith("feature")]
        self.target_cols = [col for col in self.columns if str(col).startswith("target")]
        self.prediction_cols = [
            col for col in self.columns if str(col).startswith("prediction")
        ]
        self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
        self.aux_cols = [
            col for col in self.columns if col not in self.not_aux_cols
        ]

    def __set_era_col(self):
        """ Each NumerFrame should have an era column to benefit from all functionality. """
        if "era" in self.columns:
            self.meta.era_col = "era"
        elif "date" in self.columns:
            self.meta.era_col = "date"
        else:
            self.meta.era_col = None

    def get_column_selection(self, cols: Union[str, list]) -> "NumerFrame":
        """ Return NumerFrame from selection of columns. """
        return self.loc[:, cols if isinstance(cols, list) else [cols]]

    @property
    def get_feature_data(self) -> "NumerFrame":
        """ All columns for which name starts with 'target'."""
        return self.get_column_selection(cols=self.feature_cols)

    @property
    def get_target_data(self) -> "NumerFrame":
        """ All columns for which name starts with 'target'."""
        return self.get_column_selection(cols=self.target_cols)

    @property
    def get_single_target_data(self) -> "NumerFrame":
        """ Column with name 'target' (Main Numerai target column). """
        return self.get_column_selection(cols=['target'])

    @property
    def get_prediction_data(self) -> "NumerFrame":
        """ All columns for which name starts with 'prediction'."""
        return self.get_column_selection(cols=self.prediction_cols)

    @property
    def get_aux_data(self) -> "NumerFrame":
        """ All columns that are not features, targets or predictions. """
        return self.get_column_selection(cols=self.aux_cols)

    @property
    def get_era_data(self) -> "NumerFrame":
        """ Column of all eras. """
        return self.get_column_selection(cols=self.meta.era_col)

    @property
    def get_prediction_aux_data(self) -> "NumerFrame":
        """ All predictions columns and aux columns (for ensembling, etc.). """
        return self.get_column_selection(cols=self.prediction_cols+self.aux_cols)

    @property
    def get_fncv3_feature_data(self) -> "NumerFrame":
        """ Get FNCv3 features. """
        return self.get_column_selection(cols=FNCV3_FEATURES)

    @property
    def get_small_feature_data(self) -> "NumerFrame":
        """ Small subset of the Numerai dataset for v4.2 data. """
        return self.get_column_selection(cols=SMALL_FEATURES)

    @property
    def get_medium_feature_data(self) -> "NumerFrame":
        """ Medium subset of the Numerai dataset for v4.2 data. """
        return self.get_column_selection(cols=MEDIUM_FEATURES)

    @property
    def get_v2_equivalent_feature_data(self) -> "NumerFrame":
        """ Features equivalent to the deprecated v2 Numerai data. For v4.2 data. """
        return self.get_column_selection(cols=V2_EQUIVALENT_FEATURES)

    @property
    def get_v3_equivalent_feature_data(self) -> "NumerFrame":
        """ Features equivalent to the deprecated v3 Numerai data. For v4.2 data. """
        return self.get_column_selection(cols=V3_EQUIVALENT_FEATURES)

    @property
    def get_unique_eras(self) -> List[str]:
        """ Get all unique eras in the data. """
        return self[self.meta.era_col].unique().tolist()

    def get_last_n_eras(self, n: int) -> "NumerFrame":
        """ 
        Get data for the last n eras. 
        Make sure eras are sorted in the way you prefer.
        :param n: Number of eras to select.
        :return: NumerFrame with last n eras.
        """
        eras = self[self.meta.era_col].unique()[-n:]
        return self.loc[self[self.meta.era_col].isin(eras)]

    def get_feature_group(self, group: str) -> "NumerFrame":
        """ Get feature group based on name or list of names. """
        assert group in V4_2_FEATURE_GROUP_MAPPING.keys(), \
            f"Group '{group}' not found in {V4_2_FEATURE_GROUP_MAPPING.keys()}"
        return self.get_column_selection(cols=V4_2_FEATURE_GROUP_MAPPING[group])

    def get_pattern_data(self, pattern: str) -> "NumerFrame":
        """
        Get columns based on pattern (for example '_20' to get all 20-day Numerai targets).
        :param pattern: A 'like' pattern (pattern in column_name == True)
        """
        return self.filter(like=pattern)

    def get_feature_target_pair(self, multi_target=False) -> Tuple["NumerFrame", "NumerFrame"]:
        """
        Get split of feature and target columns.
        :param multi_target: Returns only 'target' column by default.
        Returns all target columns when set to True.
        """
        X = self.get_feature_data
        y = self.get_target_data if multi_target else self.get_single_target_data
        return X, y

    def get_era_batch(self, eras: List[Any],
                      convert_to_tf = False,
                      aemlp_batch = False,
                      features: list = None,
                      targets: list = None,
                      *args, **kwargs) -> Tuple["NumerFrame", "NumerFrame"]:
        """
        Get feature target pair batch of 1 or multiple eras. \n
        :param eras: Selection of era names that should be present in era_col. \n
        :param convert_to_tf: Convert to tf.Tensor. \n
        :param aemlp_batch: Specific target batch for autoencoder training. \n
        `y` output will contain three components: features, targets and targets. \n
        :param features: List of features to select. All by default \n
        :param targets: List of targets to select. All by default. \n
        *args, **kwargs are passed to initialization of Tensor.
        """
        valid_eras = []
        for era in eras:
            assert era in self[self.meta.era_col].unique(), f"Era '{era}' not found in era column ({self.meta.era_col})"
            valid_eras.append(era)
        features = features if features else self.feature_cols
        targets = targets if targets else self.target_cols
        X = self.loc[self[self.meta.era_col].isin(valid_eras)][features].values
        y = self.loc[self[self.meta.era_col].isin(valid_eras)][targets].values
        if aemlp_batch:
            y = [X.copy(), y.copy(), y.copy()]

        if convert_to_tf:
            try:
                import tensorflow as tf
            except ImportError:
                raise ImportError("TensorFlow is not installed. Please make sure to have Tensorflow installed when setting `convert_to_tf=True`.")
            X = tf.convert_to_tensor(X, *args, **kwargs)
            if aemlp_batch:
                y = [tf.convert_to_tensor(i, *args, **kwargs) for i in y]
            else:
                y = tf.convert_to_tensor(y, *args, **kwargs)
        return X, y

    @property
    def get_dates_from_era_col(self) -> pd.Series:
        """ Column of all dates from era column. """
        assert "era" in self.columns, \
            "No 'era' column in NumerFrame. Please make sure to have a valid 'era' column to use for converting to dates."
        return self["era"].astype(int).apply(self.get_date_from_era)

    @property
    def get_eras_from_date_col(self) -> pd.Series:
        """ Column of all eras from date column. """
        assert "date" in self.columns, \
            "No 'date' column in NumerFrame. Please make sure to have a valid 'date' column."
        return self["date"].apply(self.get_era_from_date)

    def get_era_range(self, start_era: int, end_era: int) -> "NumerFrame":
        """ 
        Get all eras between two era numbers. 
        :param start_era: Era number to start from (inclusive).
        :param end_era: Era number to end with (inclusive).
        :return: NumerFrame with all eras between start_era and end_era.
        """
        assert "era" in self.columns, "No 'era' column in NumerFrame. Please make sure to have an 'era' column."
        assert isinstance(start_era, int), f"start_era should be of type 'int' but is '{type(start_era)}'"
        assert isinstance(end_era, int), f"end_era should be of type 'int' but is '{type(end_era)}'"
        assert 1 <= start_era <= end_era <= get_current_era(), \
            f"start_era should be between 1 and {get_current_era()}. Got '{start_era}'."
        assert 1 <= start_era <= end_era <= get_current_era(), \
            f"end_era should be between 1 and {get_current_era()}. Got '{end_era}'."
        assert start_era <= end_era, f"start_era should be before end_era. Got '{start_era}' and '{end_era}'"

        temp_df = self.copy()
        temp_df['era_int'] = temp_df['era'].astype(int)
        result_df = temp_df[(temp_df['era_int'] >= start_era) & (temp_df['era_int'] <= end_era)]
        return result_df.drop(columns=['era_int'])

    def get_date_range(self, start_date: pd.Timestamp, end_date: pd.Timestamp) -> "NumerFrame":
        """
        Get all eras between two dates.
        :param start_date: Starting date (inclusive).
        :param end_date: Ending date (inclusive).
        :return: NumerFrame with all eras between start_date and end_date.
        """
        assert "date" in self.columns, \
            "No 'date' column in NumerFrame. Please make sure to have a valid 'date' column."
        assert isinstance(start_date, pd.Timestamp), f"start_date should be of type 'pd.Timestamp' but is '{type(start_date)}'"
        assert isinstance(end_date, pd.Timestamp), f"end_date should be of type 'pd.Timestamp' but is '{type(end_date)}'"
        assert ERA1_TIMESTAMP <= start_date <= pd.Timestamp(get_current_date()), \
            f"start_date should be between {ERA_ONE_START} and {pd.Timestamp(get_current_date())}"
        assert ERA1_TIMESTAMP <= end_date <= pd.Timestamp(get_current_date()), \
            f"end_date should be between {ERA_ONE_START} and {pd.Timestamp(get_current_date())}"
        assert start_date <= end_date, f"start_date should be before end_date. Got '{start_date}' and '{end_date}'"

        temp_df = self.copy()
        result_df = temp_df[(temp_df["date"] >= start_date) & (temp_df["date"] <= end_date)]
        return result_df

    @staticmethod
    def get_era_from_date(date_object: pd.Timestamp) -> int:
        """ 
        Get the era number from a specific date. 
        :param date_object: Pandas Timestamp object for which to get era.
        :return: Era number.
        """
        assert isinstance(date_object, pd.Timestamp), f"date_object should be of type 'date' but is '{type(date_object)}'"
        current_date = pd.Timestamp(get_current_date())
        assert ERA1_TIMESTAMP <= date_object <= current_date, \
            f"date_object should be between {ERA_ONE_START} and {current_date}"
        return get_era_for_date(date_object.date())

    @staticmethod
    def get_date_from_era(era: int) -> pd.Timestamp:
        """ 
        Get the date from a specific era. 
        :param era: Era number for which to get date.
        Should be an integer which is at least 1.
        :return: Datetime object representing the date of the given era.
        """
        assert isinstance(era, int), f"era should be of type 'int' but is '{type(era)}'"
        assert 1 <= era <= get_current_era(), \
            f"era should be between 1 and {get_current_era()}. Got '{era}'."
        return pd.Timestamp(get_date_for_era(era))
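
For orientation, a minimal usage sketch of the selection and era utilities above. The file path, feature group name and era values are illustrative; any NumerFrame with an 'era' column works the same way.

import pandas as pd
from numerblox.numerframe import create_numerframe

df = create_numerframe("train.parquet")  # hypothetical path to a Numerai data file

# Column and era selection
sunshine = df.get_feature_group("sunshine")
targets_20d = df.get_pattern_data("_20")
recent = df.get_last_n_eras(52)
subset = df.get_era_range(start_era=100, end_era=200)

# Era <-> date conversion
first_date = df.get_date_from_era(era=1)
era_number = df.get_era_from_date(pd.Timestamp("2020-01-03"))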

get_aux_data: NumerFrame property

All columns that are not features, targets or predictions.

get_dates_from_era_col: pd.Series property

Column of all dates from era column.

get_era_data: NumerFrame property

Column of all eras.

get_eras_from_date_col: pd.Series property

Column of all eras from date column.

get_feature_data: NumerFrame property

All columns for which name starts with 'feature'.

get_fncv3_feature_data: NumerFrame property

Get FNCv3 features.

get_medium_feature_data: NumerFrame property

Medium subset of the Numerai dataset for v4.2 data.

get_prediction_aux_data: NumerFrame property

All predictions columns and aux columns (for ensembling, etc.).

get_prediction_data: NumerFrame property

All columns for which name starts with 'prediction'.

get_single_target_data: NumerFrame property

Column with name 'target' (Main Numerai target column).

get_small_feature_data: NumerFrame property

Small subset of the Numerai dataset for v4.2 data.

get_target_data: NumerFrame property

All columns for which name starts with 'target'.

get_unique_eras: List[str] property

Get all unique eras in the data.

get_v2_equivalent_feature_data: NumerFrame property

Features equivalent to the deprecated v2 Numerai data. For v4.2 data.

get_v3_equivalent_feature_data: NumerFrame property

Features equivalent to the deprecated v3 Numerai data. For v4.2 data.

__init_meta_attrs()

Dynamically track column groups.

Source code in numerblox/numerframe.py
def __init_meta_attrs(self):
    """ Dynamically track column groups. """
    self.feature_cols = [col for col in self.columns if str(col).startswith("feature")]
    self.target_cols = [col for col in self.columns if str(col).startswith("target")]
    self.prediction_cols = [
        col for col in self.columns if str(col).startswith("prediction")
    ]
    self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
    self.aux_cols = [
        col for col in self.columns if col not in self.not_aux_cols
    ]

__set_era_col()

Each NumerFrame should have an era column to benefit from all functionality.

Source code in numerblox/numerframe.py
def __set_era_col(self):
    """ Each NumerFrame should have an era column to benefit from all functionality. """
    if "era" in self.columns:
        self.meta.era_col = "era"
    elif "date" in self.columns:
        self.meta.era_col = "date"
    else:
        self.meta.era_col = None

get_column_selection(cols)

Return NumerFrame from selection of columns.

Source code in numerblox/numerframe.py
def get_column_selection(self, cols: Union[str, list]) -> "NumerFrame":
    """ Return NumerFrame from selection of columns. """
    return self.loc[:, cols if isinstance(cols, list) else [cols]]

get_date_from_era(era) staticmethod

Get the date from a specific era.

:param era: Era number for which to get date. Should be an integer which is at least 1.

:return: Datetime object representing the date of the given era.

Source code in numerblox/numerframe.py
@staticmethod
def get_date_from_era(era: int) -> pd.Timestamp:
    """ 
    Get the date from a specific era. 
    :param era: Era number for which to get date.
    Should be an integer which is at least 1.
    :return: Datetime object representing the date of the given era.
    """
    assert isinstance(era, int), f"era should be of type 'int' but is '{type(era)}'"
    assert 1 <= era <= get_current_era(), \
        f"era should be between 1 and {get_current_era()}. Got '{era}'."
    return pd.Timestamp(get_date_for_era(era))

get_date_range(start_date, end_date)

Get all eras between two dates.

:param start_date: Starting date (inclusive).

:param end_date: Ending date (inclusive).

:return: NumerFrame with all eras between start_date and end_date.

Source code in numerblox/numerframe.py
def get_date_range(self, start_date: pd.Timestamp, end_date: pd.Timestamp) -> "NumerFrame":
    """
    Get all eras between two dates.
    :param start_date: Starting date (inclusive).
    :param end_date: Ending date (inclusive).
    :return: NumerFrame with all eras between start_date and end_date.
    """
    assert "date" in self.columns, \
        "No 'date' column in NumerFrame. Please make sure to have a valid 'date' column."
    assert isinstance(start_date, pd.Timestamp), f"start_date should be of type 'pd.Timestamp' but is '{type(start_date)}'"
    assert isinstance(end_date, pd.Timestamp), f"end_date should be of type 'pd.Timestamp' but is '{type(end_date)}'"
    assert ERA1_TIMESTAMP <= start_date <= pd.Timestamp(get_current_date()), \
        f"start_date should be between {ERA_ONE_START} and {pd.Timestamp(get_current_date())}"
    assert ERA1_TIMESTAMP <= end_date <= pd.Timestamp(get_current_date()), \
        f"end_date should be between {ERA_ONE_START} and {pd.Timestamp(get_current_date())}"
    assert start_date <= end_date, f"start_date should be before end_date. Got '{start_date}' and '{end_date}'"

    temp_df = self.copy()
    result_df = temp_df[(temp_df["date"] >= start_date) & (temp_df["date"] <= end_date)]
    return result_df

get_era_batch(eras, convert_to_tf=False, aemlp_batch=False, features=None, targets=None, *args, **kwargs)

Get feature target pair batch of 1 or multiple eras.

:param eras: Selection of era names that should be present in era_col.

:param convert_to_tf: Convert to tf.Tensor.

:param aemlp_batch: Specific target batch for autoencoder training.

y output will contain three components: features, targets and targets.

:param features: List of features to select. All by default

:param targets: List of targets to select. All by default.

*args, **kwargs are passed to initialization of Tensor.

Source code in numerblox/numerframe.py
def get_era_batch(self, eras: List[Any],
                  convert_to_tf = False,
                  aemlp_batch = False,
                  features: list = None,
                  targets: list = None,
                  *args, **kwargs) -> Tuple["NumerFrame", "NumerFrame"]:
    """
    Get feature target pair batch of 1 or multiple eras. \n
    :param eras: Selection of era names that should be present in era_col. \n
    :param convert_to_tf: Convert to tf.Tensor. \n
    :param aemlp_batch: Specific target batch for autoencoder training. \n
    `y` output will contain three components: features, targets and targets. \n
    :param features: List of features to select. All by default \n
    :param targets: List of targets to select. All by default. \n
    *args, **kwargs are passed to initialization of Tensor.
    """
    valid_eras = []
    for era in eras:
        assert era in self[self.meta.era_col].unique(), f"Era '{era}' not found in era column ({self.meta.era_col})"
        valid_eras.append(era)
    features = features if features else self.feature_cols
    targets = targets if targets else self.target_cols
    X = self.loc[self[self.meta.era_col].isin(valid_eras)][features].values
    y = self.loc[self[self.meta.era_col].isin(valid_eras)][targets].values
    if aemlp_batch:
        y = [X.copy(), y.copy(), y.copy()]

    if convert_to_tf:
        try:
            import tensorflow as tf
        except ImportError:
            raise ImportError("TensorFlow is not installed. Please make sure to have Tensorflow installed when setting `convert_to_tf=True`.")
        X = tf.convert_to_tensor(X, *args, **kwargs)
        if aemlp_batch:
            y = [tf.convert_to_tensor(i, *args, **kwargs) for i in y]
        else:
            y = tf.convert_to_tensor(y, *args, **kwargs)
    return X, y
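
A minimal sketch of pulling a batch, assuming df is the NumerFrame from the earlier sketch and that the era names match values in its era column.

X, y = df.get_era_batch(eras=["0001", "0002"])                    # NumPy arrays
X_tf, y_tf = df.get_era_batch(eras=["0001"], convert_to_tf=True)  # tf.Tensor (requires TensorFlow)
X_ae, y_ae = df.get_era_batch(eras=["0001"], aemlp_batch=True)    # y -> [features, targets, targets]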

get_era_from_date(date_object) staticmethod

Get the era number from a specific date.

:param date_object: Pandas Timestamp object for which to get era.

:return: Era number.

Source code in numerblox/numerframe.py
@staticmethod
def get_era_from_date(date_object: pd.Timestamp) -> int:
    """ 
    Get the era number from a specific date. 
    :param date_object: Pandas Timestamp object for which to get era.
    :return: Era number.
    """
    assert isinstance(date_object, pd.Timestamp), f"date_object should be of type 'date' but is '{type(date_object)}'"
    current_date = pd.Timestamp(get_current_date())
    assert ERA1_TIMESTAMP <= date_object <= current_date, \
        f"date_object should be between {ERA_ONE_START} and {current_date}"
    return get_era_for_date(date_object.date())

get_era_range(start_era, end_era)

Get all eras between two era numbers.

:param start_era: Era number to start from (inclusive).

:param end_era: Era number to end with (inclusive).

:return: NumerFrame with all eras between start_era and end_era.

Source code in numerblox/numerframe.py
def get_era_range(self, start_era: int, end_era: int) -> "NumerFrame":
    """ 
    Get all eras between two era numbers. 
    :param start_era: Era number to start from (inclusive).
    :param end_era: Era number to end with (inclusive).
    :return: NumerFrame with all eras between start_era and end_era.
    """
    assert "era" in self.columns, "No 'era' column in NumerFrame. Please make sure to have an 'era' column."
    assert isinstance(start_era, int), f"start_era should be of type 'int' but is '{type(start_era)}'"
    assert isinstance(end_era, int), f"end_era should be of type 'int' but is '{type(end_era)}'"
    assert 1 <= start_era <= end_era <= get_current_era(), \
        f"start_era should be between 1 and {get_current_era()}. Got '{start_era}'."
    assert 1 <= start_era <= end_era <= get_current_era(), \
        f"end_era should be between 1 and {get_current_era()}. Got '{end_era}'."
    assert start_era <= end_era, f"start_era should be before end_era. Got '{start_era}' and '{end_era}'"

    temp_df = self.copy()
    temp_df['era_int'] = temp_df['era'].astype(int)
    result_df = temp_df[(temp_df['era_int'] >= start_era) & (temp_df['era_int'] <= end_era)]
    return result_df.drop(columns=['era_int'])

get_feature_group(group)

Get feature group based on name or list of names.

Source code in numerblox/numerframe.py
def get_feature_group(self, group: str) -> "NumerFrame":
    """ Get feature group based on name or list of names. """
    assert group in V4_2_FEATURE_GROUP_MAPPING.keys(), \
        f"Group '{group}' not found in {V4_2_FEATURE_GROUP_MAPPING.keys()}"
    return self.get_column_selection(cols=V4_2_FEATURE_GROUP_MAPPING[group])

get_feature_target_pair(multi_target=False)

Get split of feature and target columns.

:param multi_target: Returns only 'target' column by default. Returns all target columns when set to True.

Source code in numerblox/numerframe.py
def get_feature_target_pair(self, multi_target=False) -> Tuple["NumerFrame", "NumerFrame"]:
    """
    Get split of feature and target columns.
    :param multi_target: Returns only 'target' column by default.
    Returns all target columns when set to True.
    """
    X = self.get_feature_data
    y = self.get_target_data if multi_target else self.get_single_target_data
    return X, y
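
A short sketch, assuming df is a NumerFrame with feature and target columns.

X, y = df.get_feature_target_pair()                        # y is the single 'target' column
X, y_multi = df.get_feature_target_pair(multi_target=True) # y_multi holds all target columns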

get_last_n_eras(n)

Get data for the last n eras. Make sure eras are sorted in the way you prefer.

:param n: Number of eras to select.

:return: NumerFrame with last n eras.

Source code in numerblox/numerframe.py
def get_last_n_eras(self, n: int) -> "NumerFrame":
    """ 
    Get data for the last n eras. 
    Make sure eras are sorted in the way you prefer.
    :param n: Number of eras to select.
    :return: NumerFrame with last n eras.
    """
    eras = self[self.meta.era_col].unique()[-n:]
    return self.loc[self[self.meta.era_col].isin(eras)]

get_pattern_data(pattern)

Get columns based on pattern (for example '_20' to get all 20-day Numerai targets).

:param pattern: A 'like' pattern (pattern in column_name == True)

Source code in numerblox/numerframe.py
def get_pattern_data(self, pattern: str) -> "NumerFrame":
    """
    Get columns based on pattern (for example '_20' to get all 20-day Numerai targets).
    :param pattern: A 'like' pattern (pattern in column_name == True)
    """
    return self.filter(like=pattern)

create_numerframe(file_path, columns=None, *args, **kwargs)

Convenient function to initialize NumerFrame. Supports the most used file formats for Pandas DataFrames

(.csv, .parquet, .xls, .pkl, etc.). For more details check https://pandas.pydata.org/docs/reference/io.html

:param file_path: Relative or absolute path to data file.

:param columns: Which columns to read (All by default).

*args, **kwargs will be passed to Pandas loading function.

Source code in numerblox/numerframe.py
def create_numerframe(file_path: str, columns: list = None, *args, **kwargs) -> NumerFrame:
    """
    Convenient function to initialize NumerFrame.
    Support most used file formats for Pandas DataFrames \n
    (.csv, .parquet, .xls, .pkl, etc.).
    For more details check https://pandas.pydata.org/docs/reference/io.html

    :param file_path: Relative or absolute path to data file. \n
    :param columns: Which columns to read (All by default). \n
    *args, **kwargs will be passed to Pandas loading function.
    """
    assert Path(file_path).is_file(), f"{file_path} does not point to file."
    suffix = Path(file_path).suffix
    if suffix in [".csv"]:
        df = pd.read_csv(file_path, usecols=columns, *args, **kwargs)
    elif suffix in [".parquet"]:
        df = pd.read_parquet(file_path, columns=columns, *args, **kwargs)
    elif suffix in [".xls", ".xlsx", ".xlsm", "xlsb", ".odf", ".ods", ".odt"]:
        df = pd.read_excel(file_path, usecols=columns, *args, **kwargs)
    elif suffix in ['.pkl', '.pickle']:
        df = pd.read_pickle(file_path, *args, **kwargs)
        df = df.loc[:, columns] if columns else df
    else:
        raise NotImplementedError(f"Suffix '{suffix}' is not supported.")
    num_frame = NumerFrame(df)
    return num_frame
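
A minimal sketch; the path and column names below are placeholders for your own data.

from numerblox.numerframe import create_numerframe

# Reads the file with the appropriate Pandas reader and wraps it in a NumerFrame.
df = create_numerframe("data/train.parquet", columns=["era", "feature_example", "target"])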

BasePreProcessor

Bases: BaseEstimator, TransformerMixin

Common functionality for preprocessors and postprocessors.

Source code in numerblox/preprocessing/base.py
class BasePreProcessor(BaseEstimator, TransformerMixin):
    """Common functionality for preprocessors and postprocessors."""

    def __init__(self):
        sklearn.set_config(enable_metadata_routing=True)

    def fit(self, X, y=None):
        self.is_fitted_ = True
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], y=None, **kwargs
    ) -> pd.DataFrame:
        ...

    @abstractmethod
    def get_feature_names_out(self, input_features=None) -> List[str]:
        ...
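
To add a custom step, subclass BasePreProcessor and implement the two abstract methods. A minimal sketch: the class name and logic are illustrative and not part of NumerBlox, and the import path is inferred from the source file shown above.

import pandas as pd
from numerblox.preprocessing.base import BasePreProcessor

class MeanFeaturePreProcessor(BasePreProcessor):
    """Hypothetical example: outputs the row-wise mean of all input features."""

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        return X.mean(axis=1).to_frame(name="feature_mean")

    def get_feature_names_out(self, input_features=None):
        return ["feature_mean"]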

GroupStatsPreProcessor

Bases: BasePreProcessor

WARNING: Only supported for v4.2 (Rain) data. The Rain dataset (re)introduced feature groups.

Note that this class only works with pd.DataFrame input. When using it in a Pipeline, make sure that the Pandas output API is set (.set_output(transform="pandas")).

Calculates group statistics for all data groups.

:param groups: Groups to create features for. All groups by default.

Source code in numerblox/preprocessing/classic.py
class GroupStatsPreProcessor(BasePreProcessor):
    """
    WARNING: Only supported for v4.2 (Rain) data. The Rain dataset (re)introduced feature groups. \n
    Note that this class only works with `pd.DataFrame` input.
    When using it in a Pipeline, make sure that the Pandas output API is set (`.set_output(transform="pandas")`).

    Calculates group statistics for all data groups. \n
    :param groups: Groups to create features for. All groups by default. \n
    """
    def __init__(self, groups: list = None):
        super().__init__()
        self.all_groups = [
            'intelligence', 
            'charisma', 
            'strength', 
            'dexterity', 
            'constitution', 
            'wisdom', 
            'agility', 
            'serenity', 
            'sunshine', 
            'rain'
        ]
        self.groups = groups 
        self.group_names = groups if self.groups else self.all_groups
        self.feature_group_mapping = V4_2_FEATURE_GROUP_MAPPING

    def transform(self, X: pd.DataFrame) -> np.array:
        """Check validity and add group features."""
        dataf = self._add_group_features(X)
        return dataf.to_numpy()

    def _add_group_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """Mean, standard deviation and skew for each group."""
        dataf = pd.DataFrame()
        for group in self.group_names:
            cols = self.feature_group_mapping[group]
            valid_cols = [col for col in cols if col in X.columns]
            if not valid_cols:
                warnings.warn(f"None of the columns of '{group}' are in the input data. Output will be nans for the group features.")
            elif len(cols) != len(valid_cols):
                warnings.warn(f"Not all columns of '{group}' are in the input data ({len(valid_cols)} < {len(cols)}). Use remaining columns for stats features.")
            dataf.loc[:, f"feature_{group}_mean"] = X[valid_cols].mean(axis=1)
            dataf.loc[:, f"feature_{group}_std"] = X[valid_cols].std(axis=1)
            dataf.loc[:, f"feature_{group}_skew"] = X[valid_cols].skew(axis=1)
        return dataf

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        if not input_features:
            feature_names = []
            for group in self.group_names:
                feature_names.append(f"feature_{group}_mean")
                feature_names.append(f"feature_{group}_std")
                feature_names.append(f"feature_{group}_skew")
        else:
            feature_names = input_features
        return feature_names
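
A minimal usage sketch, assuming X is a pd.DataFrame with v4.2 feature columns; the import path follows the source file shown above.

from numerblox.preprocessing.classic import GroupStatsPreProcessor

gsp = GroupStatsPreProcessor(groups=["sunshine", "rain"])
group_stats = gsp.fit_transform(X)      # NumPy array with mean, std and skew per group
print(gsp.get_feature_names_out())      # e.g. ['feature_sunshine_mean', 'feature_sunshine_std', ...]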

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/classic.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    if not input_features:
        feature_names = []
        for group in self.group_names:
            feature_names.append(f"feature_{group}_mean")
            feature_names.append(f"feature_{group}_std")
            feature_names.append(f"feature_{group}_skew")
    else:
        feature_names = input_features
    return feature_names

transform(X)

Check validity and add group features.

Source code in numerblox/preprocessing/classic.py
def transform(self, X: pd.DataFrame) -> np.array:
    """Check validity and add group features."""
    dataf = self._add_group_features(X)
    return dataf.to_numpy()

DifferencePreProcessor

Bases: BasePreProcessor

Add difference features based on given windows. Run LagPreProcessor first. Usage in Pipeline works only with Pandas API. Run .set_output("pandas") on your pipeline first.

:param windows: All lag windows to process for all features. [5, 10, 15, 20] by default.

:param pct_diff: Method to calculate differences. If True, will calculate differences with a percentage change. Otherwise calculates a simple difference. Defaults to False.

:param abs_diff: Whether to also calculate the absolute value of all differences. Defaults to False.

Source code in numerblox/preprocessing/signals.py
class DifferencePreProcessor(BasePreProcessor):
    """
    Add difference features based on given windows. Run LagPreProcessor first.
    Usage in Pipeline works only with Pandas API. 
    Run `.set_output("pandas")` on your pipeline first.

    :param windows: All lag windows to process for all features. \n
    :param feature_names: All features for which you want to create differences. All features that also have lags by default. \n
    :param pct_change: Method to calculate differences. If True, will calculate differences with a percentage change. Otherwise calculates a simple difference. Defaults to False \n
    :param abs_diff: Whether to also calculate the absolute value of all differences. Defaults to True \n
    """

    def __init__(
        self,
        windows: list = None,
        pct_diff: bool = False,
        abs_diff: bool = False,
    ):
        super().__init__()
        self.windows = windows if windows else [5, 10, 15, 20]
        self.pct_diff = pct_diff
        self.abs_diff = abs_diff

    def transform(self, X: pd.DataFrame) -> np.array:
        """
        Create difference feature from lag features.
        :param X: DataFrame with lag features.
        NOTE: Make sure only lag features are present in the DataFrame.
        """
        feature_names = X.columns.tolist()
        for col in feature_names:
            assert "_lag" in col, "DifferencePreProcessor expects only lag features. Got feature: '{col}'"
        output_features = []
        for feature in tqdm(feature_names, desc="Difference feature generation"):
            for day in self.windows:
                differenced_values = (
                        (X[feature] / X[feature]) - 1
                        if self.pct_diff
                        else X[feature] - X[feature]
                    )
                X.loc[:, f"{feature}_diff{day}"] = differenced_values
                output_features.append(f"{feature}_diff{day}")
                if self.abs_diff:
                    X.loc[:, f"{feature}_absdiff{day}"] = np.abs(
                            X[f"{feature}_diff{day}"]
                        )
                    output_features.append(f"{feature}_absdiff{day}")
        self.output_features = output_features
        return X[self.output_features].to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        return self.output_features if not input_features else input_features
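
A minimal sketch of the intended chaining. LagPreProcessor is documented further below; price_df is a hypothetical DataFrame with 'close' and 'ticker' columns.

import pandas as pd

lags = LagPreProcessor(windows=[5, 10])
lag_array = lags.fit_transform(price_df[["close"]], ticker_series=price_df["ticker"])
lag_df = pd.DataFrame(lag_array, columns=lags.get_feature_names_out(), index=price_df.index)

# DifferencePreProcessor expects a DataFrame that contains only lag features.
diffs = DifferencePreProcessor(windows=[5, 10], abs_diff=True)
diff_array = diffs.fit_transform(lag_df)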

transform(X)

Create difference feature from lag features.

:param X: DataFrame with lag features. NOTE: Make sure only lag features are present in the DataFrame.

Source code in numerblox/preprocessing/signals.py
def transform(self, X: pd.DataFrame) -> np.array:
    """
    Create difference feature from lag features.
    :param X: DataFrame with lag features.
    NOTE: Make sure only lag features are present in the DataFrame.
    """
    feature_names = X.columns.tolist()
    for col in feature_names:
        assert "_lag" in col, "DifferencePreProcessor expects only lag features. Got feature: '{col}'"
    output_features = []
    for feature in tqdm(feature_names, desc="Difference feature generation"):
        for day in self.windows:
            differenced_values = (
                    (X[feature] / X[feature]) - 1
                    if self.pct_diff
                    else X[feature] - X[feature]
                )
            X.loc[:, f"{feature}_diff{day}"] = differenced_values
            output_features.append(f"{feature}_diff{day}")
            if self.abs_diff:
                X.loc[:, f"{feature}_absdiff{day}"] = np.abs(
                        X[f"{feature}_diff{day}"]
                    )
                output_features.append(f"{feature}_absdiff{day}")
    self.output_features = output_features
    return X[self.output_features].to_numpy()

EraQuantileProcessor

Bases: BasePreProcessor

Transform features into quantiles by era.

:param num_quantiles: Number of quantiles to use for quantile transformation.

:param random_state: Random state for QuantileTransformer.

:param cpu_cores: Number of CPU cores to use for parallel processing.

Source code in numerblox/preprocessing/signals.py
class EraQuantileProcessor(BasePreProcessor):
    """
    Transform features into quantiles by era.
    :param num_quantiles: Number of quantiles to use for quantile transformation. 
    :param random_state: Random state for QuantileTransformer. 
    :param cpu_cores: Number of CPU cores to use for parallel processing.  
    """
    def __init__(
        self,
        num_quantiles: int = 50,
        random_state: int = 0,
        cpu_cores: int = -1,
    ):
        super().__init__()
        self.num_quantiles = num_quantiles
        self.random_state = random_state
        self.cpu_cores = cpu_cores
        self.quantiler = QuantileTransformer(
            n_quantiles=self.num_quantiles, random_state=self.random_state
        )
        # Metadata routing
        self.set_transform_request(era_series=True)

    def _quantile_transform(self, group_data: pd.Series) -> pd.Series:
        """
        Process single feature for a single era.
        :param group_data: Data for a single feature and era.
        :return: Quantile transformed data.
        """
        transformed_data = self.quantiler.fit_transform(group_data.to_frame()).ravel()
        return pd.Series(transformed_data, index=group_data.index)

    def transform(
        self, X: Union[np.array, pd.DataFrame],
        era_series: pd.Series = None,
    ) -> np.array:
        """ 
        Quantile all features by era.
        :param X: Array or DataFrame containing features to be quantiled.
        :param era_series: Series containing era information.
        :return: Quantiled features.
        """
        X = pd.DataFrame(X)
        if era_series is None:
            warnings.warn("WARNING: 'era_series' not provided for EraQuantileProcessor! Quantiling will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to EraQuantileProcessor in this way! Not providing 'era_series' is valid for live inference, where only one era is used for quantiling.")
        else:
            assert X.shape[0] == era_series.shape[0], "Input X and era_series must have the same number of rows for quantiling."
        self.features = [col for col in X.columns]
        X.loc[:, "era"] = era_series if era_series is not None else "X"
        date_groups = X.groupby('era', group_keys=False)

        def process_feature(feature):
            group_data = date_groups[feature].apply(lambda x: self._quantile_transform(x))
            return pd.Series(group_data, name=f"{feature}_quantile{self.num_quantiles}")

        output_series_list = Parallel(n_jobs=self.cpu_cores)(
            delayed(process_feature)(feature) for feature in tqdm(self.features, desc=f"Quantiling {len(self.features)} features")
        )
        output_df = pd.concat(output_series_list, axis=1)
        return output_df.to_numpy()

    def fit_transform(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series):
        self.fit(X=X)
        return self.transform(X=X, era_series=era_series)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        if not input_features:
            feature_names = []
            for feature in self.features:
                feature_names.append(f"{feature}_quantile{self.num_quantiles}")
        else:
            feature_names = input_features
        return feature_names
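
A minimal sketch, assuming X is a DataFrame of signal features and era_series is a Series of equal length with era (or date) values.

eqp = EraQuantileProcessor(num_quantiles=50, cpu_cores=4)
quantiled = eqp.fit_transform(X, era_series=era_series)   # NumPy array of quantiled features
quantile_cols = eqp.get_feature_names_out()               # e.g. ['feature_xyz_quantile50', ...]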

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/signals.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    if not input_features:
        feature_names = []
        for feature in self.features:
            feature_names.append(f"{feature}_quantile{self.num_quantiles}")
    else:
        feature_names = input_features
    return feature_names

transform(X, era_series=None)

Quantile all features by era.

:param X: Array or DataFrame containing features to be quantiled.

:param era_series: Series containing era information.

:return: Quantiled features.

Source code in numerblox/preprocessing/signals.py
def transform(
    self, X: Union[np.array, pd.DataFrame],
    era_series: pd.Series = None,
) -> np.array:
    """ 
    Quantile all features by era.
    :param X: Array or DataFrame containing features to be quantiled.
    :param era_series: Series containing era information.
    :return: Quantiled features.
    """
    X = pd.DataFrame(X)
    if era_series is None:
        warnings.warn("WARNING: 'era_series' not provided for EraQuantileProcessor! Quantiling will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to EraQuantileProcessor in this way! Not providing 'era_series' is valid for live inference, where only one era is used for quantiling.")
    else:
        assert X.shape[0] == era_series.shape[0], "Input X and era_series must have the same number of rows for quantiling."
    self.features = [col for col in X.columns]
    X.loc[:, "era"] = era_series if era_series is not None else "X"
    date_groups = X.groupby('era', group_keys=False)

    def process_feature(feature):
        group_data = date_groups[feature].apply(lambda x: self._quantile_transform(x))
        return pd.Series(group_data, name=f"{feature}_quantile{self.num_quantiles}")

    output_series_list = Parallel(n_jobs=self.cpu_cores)(
        delayed(process_feature)(feature) for feature in tqdm(self.features, desc=f"Quantiling {len(self.features)} features")
    )
    output_df = pd.concat(output_series_list, axis=1)
    return output_df.to_numpy()

HLOCVAdjuster

Bases: BasePreProcessor

Adjust HLOCV data for splits and dividends based on ratio of unadjusted and adjusted close prices. NOTE: This step only works with DataFrame input. Usage in intermediate steps of a scikit-learn Pipeline works with the Pandas set_output API. i.e. pipeline.set_output(transform="pandas").

Source code in numerblox/preprocessing/signals.py
class HLOCVAdjuster(BasePreProcessor):
    """ 
    Adjust HLOCV data for splits and dividends based on ratio of unadjusted and adjusted close prices.
    NOTE: This step only works with DataFrame input. 
    Usage in intermediate steps of a scikit-learn Pipeline works with the Pandas set_output API.
    i.e. pipeline.set_output(transform="pandas").
    """
    def __init__(self, open_col="open", high_col="high", low_col="low", 
                 close_col="close", volume_col="volume", adj_close_col="adjusted_close"):
        super().__init__()
        self.open_col = open_col
        self.high_col = high_col
        self.low_col = low_col
        self.close_col = close_col
        self.volume_col = volume_col
        self.adj_close_col = adj_close_col
        self.adjusted_col_names = [f"adjusted_{self.high_col}", f"adjusted_{self.low_col}",
                                   f"adjusted_{self.open_col}", self.adj_close_col, 
                                   f"adjusted_{self.volume_col}"]

    def fit(self, X: pd.DataFrame, y=None):
        self.ratio_ = X[self.close_col] / X[self.adj_close_col]
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame) -> np.array:
        """
        Adjust open, high, low, close and volume for splits and dividends.
        :param X: DataFrame with columns: [high, low, open, close, volume] (HLOCV)
        :return: Array with adjusted HLOCV columns
        """
        X_copy = X.copy()  
        X_copy[f"adjusted_{self.high_col}"] = X[self.high_col] / self.ratio_
        X_copy[f"adjusted_{self.low_col}"] = X[self.low_col] / self.ratio_
        X_copy[f"adjusted_{self.open_col}"] = X[self.open_col] / self.ratio_
        X_copy[f"adjusted_{self.volume_col}"] = X[self.volume_col] * self.ratio_
        return X_copy[self.adjusted_col_names].to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        return self.adjusted_col_names
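
A minimal sketch, assuming ohlcv_df is a hypothetical DataFrame with 'open', 'high', 'low', 'close', 'volume' and 'adjusted_close' columns.

adjuster = HLOCVAdjuster()
adjusted = adjuster.fit_transform(ohlcv_df)      # NumPy array, ordered as in adjusted_col_names
adjusted_cols = adjuster.get_feature_names_out()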

transform(X)

Adjust open, high, low, close and volume for splits and dividends.

:param X: DataFrame with columns: [high, low, open, close, volume] (HLOCV)

:return: Array with adjusted HLOCV columns

Source code in numerblox/preprocessing/signals.py
def transform(self, X: pd.DataFrame) -> np.array:
    """
    Adjust open, high, low, close and volume for splits and dividends.
    :param X: DataFrame with columns: [high, low, open, close, volume] (HLOCV)
    :return: Array with adjusted HLOCV columns
    """
    X_copy = X.copy()  
    X_copy[f"adjusted_{self.high_col}"] = X[self.high_col] / self.ratio_
    X_copy[f"adjusted_{self.low_col}"] = X[self.low_col] / self.ratio_
    X_copy[f"adjusted_{self.open_col}"] = X[self.open_col] / self.ratio_
    X_copy[f"adjusted_{self.volume_col}"] = X[self.volume_col] * self.ratio_
    return X_copy[self.adjusted_col_names].to_numpy()

KatsuFeatureGenerator

Bases: BasePreProcessor

Effective feature engineering setup based on Katsu's starter notebook. Based on source by Katsu1110: https://www.kaggle.com/code1110/numeraisignals-starter-for-beginners

:param windows: Time interval to apply for window features:

  1. Percentage Rate of change

  2. Volatility

  3. Moving Average gap

:param ticker_col: Columns with tickers to iterate over.

:param close_col: Column name where you have closing price stored.

:param num_cores: Number of cores to use for multiprocessing.

:param verbose: Print additional information.

Source code in numerblox/preprocessing/signals.py
class KatsuFeatureGenerator(BasePreProcessor):
    """
    Effective feature engineering setup based on Katsu's starter notebook.
    Based on source by Katsu1110: https://www.kaggle.com/code1110/numeraisignals-starter-for-beginners

    :param windows: Time interval to apply for window features: \n
    1. Percentage Rate of change \n
    2. Volatility \n
    3. Moving Average gap \n
    :param ticker_col: Columns with tickers to iterate over. \n
    :param close_col: Column name where you have closing price stored.
    :param num_cores: Number of cores to use for multiprocessing. \n
    :param verbose: Print additional information.
    """

    warnings.filterwarnings("ignore")
    def __init__(
        self,
        windows: list,
        ticker_col: str = "ticker",
        close_col: str = "close",
        num_cores: int = None,
        verbose=True
    ):
        super().__init__()
        self.windows = windows
        self.ticker_col = ticker_col
        self.close_col = close_col
        self.num_cores = num_cores if num_cores else os.cpu_count()
        self.verbose = verbose

    def transform(self, dataf: pd.DataFrame) -> np.array:
        """
        Multiprocessing feature engineering.

        :param dataf: DataFrame with columns: [ticker, date, open, high, low, close, volume] \n
        """
        tickers = dataf.loc[:, self.ticker_col].unique().tolist()
        if self.verbose:
            print(
                f"Feature engineering for {len(tickers)} tickers using {self.num_cores} CPU cores."
            )
        dataf_list = [
            x
            for _, x in tqdm(
                dataf.groupby(self.ticker_col), desc="Generating ticker DataFrames"
            )
        ]
        dataf = self._generate_features(dataf_list=dataf_list)
        output_cols = self.get_feature_names_out()
        return dataf[output_cols].to_numpy()

    def feature_engineering(self, dataf: pd.DataFrame) -> pd.DataFrame:
        """Feature engineering for single ticker."""
        close_series = dataf.loc[:, self.close_col]
        for x in self.windows:
            dataf.loc[
                :, f"feature_{self.close_col}_ROCP_{x}"
            ] = close_series.pct_change(x)

            dataf.loc[:, f"feature_{self.close_col}_VOL_{x}"] = (
                np.log1p(close_series).pct_change().rolling(x).std()
            )

            dataf.loc[:, f"feature_{self.close_col}_MA_gap_{x}"] = (
                close_series / close_series.rolling(x).mean()
            )

        dataf.loc[:, "feature_RSI"] = self._rsi(close_series)
        macd, macd_signal = self._macd(close_series)
        dataf.loc[:, "feature_MACD"] = macd
        dataf.loc[:, "feature_MACD_signal"] = macd_signal
        return dataf

    def _generate_features(self, dataf_list: list) -> pd.DataFrame:
        """Add features for list of ticker DataFrames and concatenate."""
        with Pool(self.num_cores) as p:
            feature_datafs = list(
                tqdm(
                    p.imap(self.feature_engineering, dataf_list),
                    desc="Generating features",
                    total=len(dataf_list),
                )
            )
        return pd.concat(feature_datafs)

    @staticmethod
    def _rsi(close: pd.Series, period: int = 14) -> pd.Series:
        """
        See source https://github.com/peerchemist/finta
        and fix https://www.tradingview.com/wiki/Talk:Relative_Strength_Index_(RSI)
        """
        delta = close.diff()
        up, down = delta.copy(), delta.copy()
        up[up < 0] = 0
        down[down > 0] = 0

        gain = up.ewm(com=(period - 1), min_periods=period).mean()
        loss = down.abs().ewm(com=(period - 1), min_periods=period).mean()

        rs = gain / loss
        return pd.Series(100 - (100 / (1 + rs)))

    def _macd(
        self, close: pd.Series, span1=12, span2=26, span3=9
    ) -> Tuple[pd.Series, pd.Series]:
        """Compute MACD and MACD signal."""
        exp1 = self.__ema1(close, span1)
        exp2 = self.__ema1(close, span2)
        macd = 100 * (exp1 - exp2) / exp2
        signal = self.__ema1(macd, span3)
        return macd, signal

    @staticmethod
    def __ema1(series: pd.Series, span: int) -> pd.Series:
        """Exponential moving average"""
        a = 2 / (span + 1)
        return series.ewm(alpha=a).mean()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        if not input_features:
            feature_names = []
            for x in self.windows:
                feature_names += [
                    f"feature_{self.close_col}_ROCP_{x}",
                    f"feature_{self.close_col}_VOL_{x}",
                    f"feature_{self.close_col}_MA_gap_{x}",
                ]
            feature_names += [
                "feature_RSI",
                "feature_MACD",
                "feature_MACD_signal",
            ]
        else:
            feature_names = input_features
        return feature_names
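
A minimal sketch, assuming price_df is a hypothetical DataFrame with at least 'ticker' and 'close' columns, sorted by date within each ticker.

kfg = KatsuFeatureGenerator(windows=[20, 40, 60], num_cores=4)
# Uses multiprocessing internally, so run inside an "if __name__ == '__main__':" guard in scripts.
katsu_features = kfg.fit_transform(price_df)    # NumPy array
feature_names = kfg.get_feature_names_out()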

__ema1(series, span) staticmethod

Exponential moving average

Source code in numerblox/preprocessing/signals.py
@staticmethod
def __ema1(series: pd.Series, span: int) -> pd.Series:
    """Exponential moving average"""
    a = 2 / (span + 1)
    return series.ewm(alpha=a).mean()

feature_engineering(dataf)

Feature engineering for single ticker.

Source code in numerblox/preprocessing/signals.py
def feature_engineering(self, dataf: pd.DataFrame) -> pd.DataFrame:
    """Feature engineering for single ticker."""
    close_series = dataf.loc[:, self.close_col]
    for x in self.windows:
        dataf.loc[
            :, f"feature_{self.close_col}_ROCP_{x}"
        ] = close_series.pct_change(x)

        dataf.loc[:, f"feature_{self.close_col}_VOL_{x}"] = (
            np.log1p(close_series).pct_change().rolling(x).std()
        )

        dataf.loc[:, f"feature_{self.close_col}_MA_gap_{x}"] = (
            close_series / close_series.rolling(x).mean()
        )

    dataf.loc[:, "feature_RSI"] = self._rsi(close_series)
    macd, macd_signal = self._macd(close_series)
    dataf.loc[:, "feature_MACD"] = macd
    dataf.loc[:, "feature_MACD_signal"] = macd_signal
    return dataf

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/signals.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    if not input_features:
        feature_names = []
        for x in self.windows:
            feature_names += [
                f"feature_{self.close_col}_ROCP_{x}",
                f"feature_{self.close_col}_VOL_{x}",
                f"feature_{self.close_col}_MA_gap_{x}",
            ]
        feature_names += [
            "feature_RSI",
            "feature_MACD",
            "feature_MACD_signal",
        ]
    else:
        feature_names = input_features
    return feature_names

transform(dataf)

Multiprocessing feature engineering.

:param dataf: DataFrame with columns: [ticker, date, open, high, low, close, volume]

Source code in numerblox/preprocessing/signals.py
def transform(self, dataf: pd.DataFrame) -> np.array:
    """
    Multiprocessing feature engineering.

    :param dataf: DataFrame with columns: [ticker, date, open, high, low, close, volume] \n
    """
    tickers = dataf.loc[:, self.ticker_col].unique().tolist()
    if self.verbose:
        print(
            f"Feature engineering for {len(tickers)} tickers using {self.num_cores} CPU cores."
        )
    dataf_list = [
        x
        for _, x in tqdm(
            dataf.groupby(self.ticker_col), desc="Generating ticker DataFrames"
        )
    ]
    dataf = self._generate_features(dataf_list=dataf_list)
    output_cols = self.get_feature_names_out()
    return dataf[output_cols].to_numpy()

LagPreProcessor

Bases: BasePreProcessor

Add lag features based on given windows.

:param windows: All lag windows to process for all features.

[5, 10, 15, 20] by default (4 weeks lookback)

Source code in numerblox/preprocessing/signals.py
class LagPreProcessor(BasePreProcessor):
    """
    Add lag features based on given windows.

    :param windows: All lag windows to process for all features. \n
    [5, 10, 15, 20] by default (4 weeks lookback) \n
    """

    def __init__(self, windows: list = None,):
        super().__init__()
        self.windows = windows if windows else [5, 10, 15, 20]
        # Metadata routing
        self.set_transform_request(ticker_series=True)

    def transform(self, X: Union[np.array, pd.DataFrame], ticker_series: pd.Series = None) -> np.array:
        if ticker_series is None:
            warnings.warn("WARNING: 'era_series' not provided for LagPreProcessor! Lags will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to LagPreProcessor in this way! Not providing 'era_series' is valid for live inference, where only one era is used for creating lags.")
        else:
            assert X.shape[0] == ticker_series.shape[0], "Input X and ticker_series must have the same number of rows for lag generation."

        X = pd.DataFrame(X)
        feature_cols = X.columns.tolist()
        X["ticker"] = ticker_series if ticker_series is not None else "XXXXXXXXXXXXXXXXXXXXXX"
        ticker_groups = X.groupby("ticker")
        output_features = []
        for feature in tqdm(feature_cols, desc="Lag feature generation"):
            feature_group = ticker_groups[feature]
            for day in self.windows:
                shifted = feature_group.shift(day)
                X.loc[:, f"{feature}_lag{day}"] = shifted
                output_features.append(f"{feature}_lag{day}")
        self.output_features = output_features
        return X[output_features].to_numpy()

    def fit_transform(self, X: Union[np.array, pd.DataFrame], ticker_series: pd.Series):
        self.fit(X=X)
        return self.transform(X=X, ticker_series=ticker_series)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        return self.output_features if not input_features else input_features
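
A minimal sketch outside a Pipeline, with a hypothetical price_df holding 'close', 'volume' and 'ticker' columns.

lags = LagPreProcessor(windows=[5, 10, 15, 20])
lag_array = lags.fit_transform(price_df[["close", "volume"]], ticker_series=price_df["ticker"])
lag_cols = lags.get_feature_names_out()   # e.g. ['close_lag5', ..., 'volume_lag20']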

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/signals.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    return self.output_features if not input_features else input_features

MinimumDataFilter

Bases: BasePreProcessor

Filter dates and tickers based on minimum data requirements. NOTE: This step only works with DataFrame input.

:param min_samples_date: Minimum number of samples per date. Defaults to 200.

:param min_samples_ticker: Minimum number of samples per ticker. Defaults to 1200.

:param blacklist_tickers: List of tickers to exclude from the dataset. Defaults to None.

:param date_col: Column name for date. Defaults to "date".

:param ticker_col: Column name for ticker. Defaults to "bloomberg_ticker".

Source code in numerblox/preprocessing/signals.py
class MinimumDataFilter(BasePreProcessor):
    """ 
    Filter dates and tickers based on minimum data requirements. 
    NOTE: This step only works with DataFrame input.

    :param min_samples_date: Minimum number of samples per date. Defaults to 200.
    :param min_samples_ticker: Minimum number of samples per ticker. Defaults to 1200.
    :param blacklist_tickers: List of tickers to exclude from the dataset. Defaults to None.
    :param date_col: Column name for date. Defaults to "date".
    :param ticker_col: Column name for ticker. Defaults to "bloomberg_ticker".
    """
    def __init__(self, min_samples_date: int = 200, min_samples_ticker: int = 1200, blacklist_tickers: list = None, date_col="date", ticker_col="bloomberg_ticker"):
        super().__init__()
        self.min_samples_date = min_samples_date
        self.min_samples_ticker = min_samples_ticker
        self.blacklist_tickers = blacklist_tickers
        self.date_col = date_col
        self.ticker_col = ticker_col

    def fit(self, X: pd.DataFrame, y=None):
        self.feature_names_out_ = X.columns.tolist()
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame) -> np.array:
        """
        Filter dates and tickers based on minimum data requirements.
        :param X: DataFrame with columns: [ticker_col, date_col, open, high, low, close, volume] (HLOCV)
        :return: Array with filtered DataFrame
        """
        filtered_data = X.groupby(self.date_col).filter(lambda x: len(x) >= self.min_samples_date)
        records_per_ticker = (
            filtered_data.reset_index(drop=False)
            .groupby(self.ticker_col)[self.date_col]
            .nunique()
            .reset_index()
            .sort_values(by=self.date_col)
        )
        tickers_with_records = records_per_ticker.query(f"{self.date_col} >= {self.min_samples_ticker}")[self.ticker_col].values
        filtered_data = filtered_data.loc[filtered_data[self.ticker_col].isin(tickers_with_records)].reset_index(drop=True)

        if self.blacklist_tickers:
            filtered_data = filtered_data.loc[~filtered_data[self.ticker_col].isin(self.blacklist_tickers)]

        return filtered_data.to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        check_is_fitted(self)
        return self.feature_names_out_ if not input_features else input_features
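
A minimal sketch, with a hypothetical raw_df that has 'date' and 'bloomberg_ticker' columns.

import pandas as pd

mdf = MinimumDataFilter(min_samples_date=200, min_samples_ticker=1200, blacklist_tickers=["XYZ US"])  # blacklist entry is illustrative
filtered_array = mdf.fit_transform(raw_df)
filtered_df = pd.DataFrame(filtered_array, columns=mdf.get_feature_names_out())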

transform(X)

Filter dates and tickers based on minimum data requirements.

:param X: DataFrame with columns: [ticker_col, date_col, open, high, low, close, volume] (HLOCV)

:return: Array with filtered DataFrame

Source code in numerblox/preprocessing/signals.py
def transform(self, X: pd.DataFrame) -> np.array:
    """
    Filter dates and tickers based on minimum data requirements.
    :param X: DataFrame with columns: [ticker_col, date_col, open, high, low, close, volume] (HLOCV)
    :return: Array with filtered DataFrame
    """
    filtered_data = X.groupby(self.date_col).filter(lambda x: len(x) >= self.min_samples_date)
    records_per_ticker = (
        filtered_data.reset_index(drop=False)
        .groupby(self.ticker_col)[self.date_col]
        .nunique()
        .reset_index()
        .sort_values(by=self.date_col)
    )
    tickers_with_records = records_per_ticker.query(f"{self.date_col} >= {self.min_samples_ticker}")[self.ticker_col].values
    filtered_data = filtered_data.loc[filtered_data[self.ticker_col].isin(tickers_with_records)].reset_index(drop=True)

    if self.blacklist_tickers:
        filtered_data = filtered_data.loc[~filtered_data[self.ticker_col].isin(self.blacklist_tickers)]

    return filtered_data.to_numpy()

PandasTaFeatureGenerator

Bases: BasePreProcessor

Generate features with pandas-ta. https://github.com/twopirllc/pandas-ta Usage in Pipeline works only with Pandas API. Run .set_output("pandas") on your pipeline first.

:param strategy: Valid Pandas Ta strategy.

For more information on creating a strategy, see:

https://github.com/twopirllc/pandas-ta#pandas-ta-strategy

By default, a strategy with RSI(14) and RSI(60) is used.

:param ticker_col: Column name for grouping by tickers.

:param num_cores: Number of cores to use for multiprocessing.

By default, all available cores are used.

Source code in numerblox/preprocessing/signals.py
class PandasTaFeatureGenerator(BasePreProcessor):
    """
    Generate features with pandas-ta.
    https://github.com/twopirllc/pandas-ta
    Usage in Pipeline works only with Pandas API. 
    Run `.set_output("pandas")` on your pipeline first.

    :param strategy: Valid Pandas Ta strategy. \n
    For more information on creating a strategy, see: \n
    https://github.com/twopirllc/pandas-ta#pandas-ta-strategy \n
    By default, a strategy with RSI(14) and RSI(60) is used. \n
    :param ticker_col: Column name for grouping by tickers. \n
    :param num_cores: Number of cores to use for multiprocessing. \n
    By default, all available cores are used. \n
    """
    def __init__(self, 
                 strategy: ta.Strategy = None,
                 ticker_col: str = "ticker",
                 num_cores: int = None,
    ):
        super().__init__()
        self.ticker_col = ticker_col
        self.num_cores = num_cores if num_cores else os.cpu_count()
        standard_strategy = ta.Strategy(name="standard", 
                                        ta=[{"kind": "rsi", "length": 14, "col_names": ("feature_RSI_14")},
                                            {"kind": "rsi", "length": 60, "col_names": ("feature_RSI_60")}])
        self.strategy = strategy if strategy is not None else standard_strategy

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Main feature generation method. \n 
        :param X: DataFrame with columns: [ticker, date, open, high, low, close, volume] \n
        :return: PandasTA features
        """
        initial_features = X.columns.tolist()
        dataf_list = [
            x
            for _, x in tqdm(
                X.groupby(self.ticker_col), desc="Generating ticker DataFrames"
            )
        ]
        X = self._generate_features(dataf_list=dataf_list)
        output_df = X.drop(columns=initial_features)
        self.output_cols = output_df.columns.tolist()
        return output_df

    def _generate_features(self, dataf_list: List[pd.DataFrame]) -> pd.DataFrame:
        """
        Add features for list of ticker DataFrames and concatenate.
        :param dataf_list: List of DataFrames for each ticker.
        :return: Concatenated DataFrame with features added for the full list of tickers.
        """
        with Pool(self.num_cores) as p:
            feature_datafs = list(
                tqdm(
                    p.imap(self.add_features, dataf_list),
                    desc="Generating pandas-ta features",
                    total=len(dataf_list),
                )
            )
        return pd.concat(feature_datafs)

    def add_features(self, ticker_df: pd.DataFrame) -> pd.DataFrame:
        """ 
        The TA strategy is applied to the DataFrame here.
        :param ticker_df: DataFrame for a single ticker.
        :return: DataFrame with features added.
        """
        # We use a different multiprocessing engine, so shut off pandas_ta's own multiprocessing
        ticker_df.ta.cores = 0
        # Run strategy
        ticker_df.ta.strategy(self.strategy)
        return ticker_df

    def get_feature_names_out(self, input_features=None) -> List[str]:
        return self.output_cols if not input_features else input_features

add_features(ticker_df)

The TA strategy is applied to the DataFrame here. :param ticker_df: DataFrame for a single ticker. :return: DataFrame with features added.

Source code in numerblox/preprocessing/signals.py
def add_features(self, ticker_df: pd.DataFrame) -> pd.DataFrame:
    """ 
    The TA strategy is applied to the DataFrame here.
    :param ticker_df: DataFrame for a single ticker.
    :return: DataFrame with features added.
    """
    # We use a different multiprocessing engine, so shut off pandas_ta's own multiprocessing
    ticker_df.ta.cores = 0
    # Run strategy
    ticker_df.ta.strategy(self.strategy)
    return ticker_df

transform(X)

Main feature generation method.

:param X: DataFrame with columns: [ticker, date, open, high, low, close, volume]

:return: PandasTA features

Source code in numerblox/preprocessing/signals.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Main feature generation method. \n 
    :param X: DataFrame with columns: [ticker, date, open, high, low, close, volume] \n
    :return: PandasTA features
    """
    initial_features = X.columns.tolist()
    dataf_list = [
        x
        for _, x in tqdm(
            X.groupby(self.ticker_col), desc="Generating ticker DataFrames"
        )
    ]
    X = self._generate_features(dataf_list=dataf_list)
    output_df = X.drop(columns=initial_features)
    self.output_cols = output_df.columns.tolist()
    return output_df

ReduceMemoryProcessor

Bases: BasePreProcessor

Reduce memory usage as much as possible.

Credits to kainsama and others for writing about memory usage reduction for Numerai data: https://forum.numer.ai/t/reducing-memory/313

:param deep_mem_inspect: Introspect the data deeply by interrogating object dtypes. Yields a more accurate representation of memory usage if you have complex object columns. :param verbose: Print memory usage before and after optimization.
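
A minimal usage sketch (illustrative only): downcast the numeric columns of a random DataFrame and inspect the reported savings.

import numpy as np
import pandas as pd
from numerblox.preprocessing.signals import ReduceMemoryProcessor

dataf = pd.DataFrame(
    np.random.uniform(size=(10_000, 5)),
    columns=[f"feature_{i}" for i in range(5)],
)
processor = ReduceMemoryProcessor(deep_mem_inspect=True, verbose=True)
reduced = processor.transform(dataf)        # NumPy array with downcast dtypes
print(processor.get_feature_names_out())    # original column names are preserved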

Source code in numerblox/preprocessing/signals.py
class ReduceMemoryProcessor(BasePreProcessor):
    """
    Reduce memory usage as much as possible.

    Credits to kainsama and others for writing about memory usage reduction for Numerai data:
    https://forum.numer.ai/t/reducing-memory/313

    :param deep_mem_inspect: Introspect the data deeply by interrogating object dtypes.
    Yields a more accurate representation of memory usage if you have complex object columns.
    :param verbose: Print memory usage before and after optimization.
    """

    def __init__(self, deep_mem_inspect=False, verbose=True):
        super().__init__()
        self.deep_mem_inspect = deep_mem_inspect
        self.verbose = verbose

    def transform(self, dataf: Union[np.array, pd.DataFrame]) -> np.array:
        return self._reduce_mem_usage(dataf).to_numpy()

    def _reduce_mem_usage(self, dataf: Union[np.array, pd.DataFrame]) -> pd.DataFrame:
        """
        Iterate through all columns and modify the numeric column types
        to reduce memory usage.
        """
        dataf = pd.DataFrame(dataf)
        self.output_cols = dataf.columns.tolist()
        start_memory_usage = (
            dataf.memory_usage(deep=self.deep_mem_inspect).sum() / 1024**2
        )
        if self.verbose:
            print(
                f"Memory usage of DataFrame is {round(start_memory_usage, 2)} MB"
            )

        for col in dataf.columns:
            col_type = dataf[col].dtype.name

            if col_type not in [
                "object",
                "category",
                "datetime64[ns, UTC]",
                "datetime64[ns]",
            ]:
                c_min = dataf[col].min()
                c_max = dataf[col].max()
                if str(col_type)[:3] == "int":
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        dataf[col] = dataf[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int16).min
                        and c_max < np.iinfo(np.int16).max
                    ):
                        dataf[col] = dataf[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int32).min
                        and c_max < np.iinfo(np.int32).max
                    ):
                        dataf[col] = dataf[col].astype(np.int32)
                    elif (
                        c_min > np.iinfo(np.int64).min
                        and c_max < np.iinfo(np.int64).max
                    ):
                        dataf[col] = dataf[col].astype(np.int64)
                else:
                    if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                    ):
                        dataf[col] = dataf[col].astype(np.float16)
                    elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                    ):
                        dataf[col] = dataf[col].astype(np.float32)
                    else:
                        dataf[col] = dataf[col].astype(np.float64)

        end_memory_usage = (
            dataf.memory_usage(deep=self.deep_mem_inspect).sum() / 1024**2
        )
        if self.verbose:
            print(
                f"Memory usage after optimization is: {round(end_memory_usage, 2)} MB"
            )
            print(
                f"Usage decreased by {round(100 * (start_memory_usage - end_memory_usage) / start_memory_usage, 2)}%"
            )
        return dataf

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        return self.output_cols if not input_features else input_features

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/preprocessing/signals.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    return self.output_cols if not input_features else input_features

TickerMapper

Bases: BasePreProcessor

Map ticker from one format to another.

:param ticker_col: Column used for mapping. Must already be present in the input data.

:param target_ticker_format: Format to map tickers to. Must be present in the ticker map.

For the default mapper, the supported ticker formats are: ['ticker', 'bloomberg_ticker', 'yahoo']

:param mapper_path: Path to CSV file containing at least ticker_col and target_ticker_format columns.

Can be either a web link or a local path. Numerai Signals mapping by default.
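
A minimal usage sketch (illustrative only; the ticker values below are made up). Note that the default mapper_path points to a CSV on S3, so instantiation requires internet access.

import pandas as pd
from numerblox.preprocessing.signals import TickerMapper

mapper = TickerMapper(ticker_col="ticker", target_ticker_format="bloomberg_ticker")
tickers = pd.Series(["AAPL", "MSFT"])     # hypothetical tickers in 'ticker' format
mapped = mapper.transform(tickers)        # NumPy array of mapped tickers (NaN if unmapped)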

Source code in numerblox/preprocessing/signals.py
class TickerMapper(BasePreProcessor):
    """
    Map ticker from one format to another. \n
    :param ticker_col: Column used for mapping. Must already be present in the input data. \n
    :param target_ticker_format: Format to map tickers to. Must be present in the ticker map. \n
    For the default mapper, the supported ticker formats are: ['ticker', 'bloomberg_ticker', 'yahoo'] \n
    :param mapper_path: Path to CSV file containing at least ticker_col and target_ticker_format columns. \n
    Can be either a web link or a local path. Numerai Signals mapping by default.
    """

    def __init__(
        self, ticker_col: str = "ticker", target_ticker_format: str = "bloomberg_ticker",
        mapper_path: str = "https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv"
    ):
        super().__init__()
        self.ticker_col = ticker_col
        self.target_ticker_format = target_ticker_format

        self.signals_map_path = mapper_path
        self.ticker_map = pd.read_csv(self.signals_map_path)

        assert (
            self.ticker_col in self.ticker_map.columns
        ), f"Ticker column '{self.ticker_col}' is not available in ticker mapping."
        assert (
            self.target_ticker_format in self.ticker_map.columns
        ), f"Target ticker column '{self.target_ticker_format}' is not available in ticker mapping."

        self.mapping = dict(
            self.ticker_map[[self.ticker_col, self.target_ticker_format]].values
        )

    def transform(self, X: Union[np.array, pd.Series]) -> np.array:
        """
        Transform ticker column.
        :param X: Ticker column
        :return tickers: Mapped tickers
        """
        tickers = pd.DataFrame(X, columns=[self.ticker_col])[self.ticker_col].map(self.mapping)
        return tickers.to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        return [self.target_ticker_format] if not input_features else input_features

transform(X)

Transform ticker column. :param X: Ticker column :return tickers: Mapped tickers

Source code in numerblox/preprocessing/signals.py
def transform(self, X: Union[np.array, pd.Series]) -> np.array:
    """
    Transform ticker column.
    :param X: Ticker column
    :return tickers: Mapped tickers
    """
    tickers = pd.DataFrame(X, columns=[self.ticker_col])[self.ticker_col].map(self.mapping)
    return tickers.to_numpy()

BaseTargetProcessor

Bases: BaseEstimator, TransformerMixin

Common functionality for preprocessors and postprocessors.

Source code in numerblox/targets.py
class BaseTargetProcessor(BaseEstimator, TransformerMixin):
    """Common functionality for preprocessors and postprocessors."""

    def __init__(self):
        sklearn.set_config(enable_metadata_routing=True)
        self.set_transform_request(era_series=True)

    def fit(self, X, y=None):
        self.is_fitted_ = True
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], y=None
    ) -> pd.DataFrame:
        ...

    @abstractmethod
    def get_feature_names_out(self, input_features=None) -> List[str]:
        ...

BayesianGMMTargetProcessor

Bases: BaseTargetProcessor

Generate synthetic (fake) target using a Bayesian Gaussian Mixture model.

Based on Michael Oliver's GitHub Gist implementation:

https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93

:param n_components: Number of components for fitting Bayesian Gaussian Mixture Model.
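
A rough sketch with synthetic data (illustrative only). Note that, as the source below shows, fit adds 'era' and 'target' columns to X in place and transform expects them to be present, so the same X object is reused here and numeric era labels are used.

import numpy as np
import pandas as pd
from numerblox.targets import BayesianGMMTargetProcessor

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.uniform(size=(200, 4)), columns=[f"feature_{i}" for i in range(4)])
y = pd.Series(rng.uniform(size=200), name="target")
era_series = pd.Series(np.repeat(np.arange(1, 11), 20), name="era")  # 10 numeric eras

processor = BayesianGMMTargetProcessor(n_components=2)
processor.fit(X, y, era_series=era_series)
fake_target = processor.transform(X, era_series=era_series)  # values in {0, 0.25, 0.5, 0.75, 1}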

Source code in numerblox/targets.py
class BayesianGMMTargetProcessor(BaseTargetProcessor):
    """
    Generate synthetic (fake) target using a Bayesian Gaussian Mixture model. \n
    Based on Michael Oliver's GitHub Gist implementation: \n
    https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93

    :param n_components: Number of components for fitting Bayesian Gaussian Mixture Model.
    """
    def __init__(
        self,
        n_components: int = 3,
    ):
        super().__init__()
        self.set_fit_request(era_series=True)
        self.n_components = n_components
        self.ridge = Ridge(fit_intercept=False)
        self.bins = [0, 0.05, 0.25, 0.75, 0.95, 1]

    def fit(self, X: pd.DataFrame, y: pd.Series, era_series: pd.Series):
        """
        Fit Bayesian Gaussian Mixture model on coefficients and normalize.
        :param X: DataFrame containing features.
        :param y: Series containing real target.
        :param era_series: Series containing era information.
        """
        bgmm = BayesianGaussianMixture(n_components=self.n_components)
        coefs = self._get_coefs(dataf=X, y=y, era_series=era_series)
        bgmm.fit(coefs)
        # make probability of sampling each component equal to better balance rare regimes
        bgmm.weights_[:] = 1 / self.n_components
        self.bgmm_ = bgmm
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame, era_series: pd.Series) -> np.array:
        """
        Main method for generating fake target.
        :param X: DataFrame containing features.
        :param era_series: Series containing era information.
        """
        check_is_fitted(self, "bgmm_")
        assert len(X) == len(era_series), "X and eras must be same length."
        all_eras = era_series.unique().tolist()
        # Scale data between 0 and 1
        X = X.astype(float)
        X /= X.max()
        X -= 0.5
        X.loc[:, 'era'] = era_series

        fake_target = self._generate_target(dataf=X, all_eras=all_eras)
        return fake_target

    def _get_coefs(self, dataf: pd.DataFrame, y: pd.Series, era_series: pd.Series) -> np.ndarray:
        """
        Generate coefficients for BGMM.
        :param dataf: DataFrame containing features.
        :param y: Series containing real target.
        """
        coefs = []
        dataf.loc[:, 'era'] = era_series
        dataf.loc[:, 'target'] = y
        all_eras = dataf['era'].unique().tolist()
        for era in all_eras:
            era_df = dataf[dataf['era'] == era]
            era_y = era_df.loc[:, 'target']
            era_df = era_df.drop(columns=["era", "target"])
            self.ridge.fit(era_df, era_y)
            coefs.append(self.ridge.coef_)
        stacked_coefs = np.vstack(coefs)
        return stacked_coefs

    def _generate_target(
        self, dataf: pd.DataFrame, all_eras: list
    ) -> np.ndarray:
        """Generate fake target using Bayesian Gaussian Mixture model."""
        fake_target = []
        for era in tqdm(all_eras, desc="Generating fake target"):
            features = dataf[dataf['era'] == era]
            features = features.drop(columns=["era", "target"])
            # Sample a set of weights from GMM
            beta, _ = self.bgmm_.sample(1)
            # Create fake continuous target
            fake_targ = features @ beta[0]
            # Bin fake target like real target
            fake_targ = (rankdata(fake_targ) - 0.5) / len(fake_targ)
            fake_targ = (np.digitize(fake_targ, self.bins) - 1) / 4
            fake_target.append(fake_targ)
        return np.concatenate(fake_target)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
        return ["fake_target"] if not input_features else input_features

fit(X, y, era_series)

Fit Bayesian Gaussian Mixture model on coefficients and normalize. :param X: DataFrame containing features. :param y: Series containing real target. :param era_series: Series containing era information.

Source code in numerblox/targets.py
def fit(self, X: pd.DataFrame, y: pd.Series, era_series: pd.Series):
    """
    Fit Bayesian Gaussian Mixture model on coefficients and normalize.
    :param X: DataFrame containing features.
    :param y: Series containing real target.
    :param era_series: Series containing era information.
    """
    bgmm = BayesianGaussianMixture(n_components=self.n_components)
    coefs = self._get_coefs(dataf=X, y=y, era_series=era_series)
    bgmm.fit(coefs)
    # make probability of sampling each component equal to better balance rare regimes
    bgmm.weights_[:] = 1 / self.n_components
    self.bgmm_ = bgmm
    self.is_fitted_ = True
    return self

get_feature_names_out(input_features=None)

Return feature names.

Source code in numerblox/targets.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names."""
    return ["fake_target"] if not input_features else input_features

transform(X, era_series)

Main method for generating fake target. :param X: DataFrame containing features. :param era_series: Series containing era information.

Source code in numerblox/targets.py
def transform(self, X: pd.DataFrame, era_series: pd.Series) -> np.array:
    """
    Main method for generating fake target.
    :param X: DataFrame containing features.
    :param era_series: Series containing era information.
    """
    check_is_fitted(self, "bgmm_")
    assert len(X) == len(era_series), "X and eras must be same length."
    all_eras = era_series.unique().tolist()
    # Scale data between 0 and 1
    X = X.astype(float)
    X /= X.max()
    X -= 0.5
    X.loc[:, 'era'] = era_series

    fake_target = self._generate_target(dataf=X, all_eras=all_eras)
    return fake_target

SignalsTargetProcessor

Bases: BaseTargetProcessor

Engineer targets for Numerai Signals.

More information on how Numerai Signals targets are implemented:

https://forum.numer.ai/t/decoding-the-signals-target/2501

:param price_col: Column from which target will be derived.

:param windows: Timeframes to use for engineering targets. 10 and 20-day by default.

:param bins: Binning used to create group targets. Nomi binning by default.

:param labels: Scaling for binned target. Must be same length as resulting bins (bins-1). Numerai labels by default.
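
A rough sketch on synthetic prices (illustrative only): engineer raw, rank and group targets for the default 10- and 20-day windows, grouped per era.

import numpy as np
import pandas as pd
from numerblox.targets import SignalsTargetProcessor

rng = np.random.default_rng(0)
n = 300
dataf = pd.DataFrame({"close": 100 + np.cumsum(rng.normal(size=n))})
era_series = pd.Series(np.repeat(np.arange(n // 10), 10), name="era")

processor = SignalsTargetProcessor(price_col="close")
targets = processor.transform(dataf, era_series=era_series)   # shape (300, 6)
print(processor.get_feature_names_out())
# ['target_10d_raw', 'target_10d_rank', 'target_10d_group', 'target_20d_raw', ...]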

Source code in numerblox/targets.py
class SignalsTargetProcessor(BaseTargetProcessor):
    """
    Engineer targets for Numerai Signals. \n
    More information on how Numerai Signals targets are implemented: \n
    https://forum.numer.ai/t/decoding-the-signals-target/2501

    :param price_col: Column from which target will be derived. \n
    :param windows: Timeframes to use for engineering targets. 10 and 20-day by default. \n
    :param bins: Binning used to create group targets. Nomi binning by default. \n
    :param labels: Scaling for binned target. Must be same length as resulting bins (bins-1). Numerai labels by default.
    """

    def __init__(
        self,
        price_col: str = "close",
        windows: list = None,
        bins: list = None,
        labels: list = None,
    ):
        super().__init__()
        self.price_col = price_col
        self.windows = windows if windows else [10, 20]
        self.bins = bins if bins else [0, 0.05, 0.25, 0.75, 0.95, 1]
        self.labels = labels if labels else [0, 0.25, 0.50, 0.75, 1]

    def transform(self, dataf: pd.DataFrame, era_series: pd.Series) -> np.array:
        for window in tqdm(self.windows, desc="Signals target engineering windows"):
            dataf.loc[:, f"target_{window}d_raw"] = (
                dataf[self.price_col].pct_change(periods=window).shift(-window)
            )
            era_groups = dataf.groupby(era_series)

            dataf.loc[:, f"target_{window}d_rank"] = era_groups[
                f"target_{window}d_raw"
            ].rank(pct=True, method="first")
            dataf.loc[:, f"target_{window}d_group"] = era_groups[
                f"target_{window}d_rank"
            ].transform(
                lambda group: pd.cut(
                    group, bins=self.bins, labels=self.labels, include_lowest=True
                )
            )
        output_cols = self.get_feature_names_out()
        return dataf[output_cols].to_numpy()

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names of Signals targets. """
        if not input_features:
            feature_names = []
            for window in self.windows:
                feature_names.append(f"target_{window}d_raw")
                feature_names.append(f"target_{window}d_rank")
                feature_names.append(f"target_{window}d_group")
        else:
            feature_names = input_features
        return feature_names

get_feature_names_out(input_features=None)

Return feature names of Signals targets.

Source code in numerblox/targets.py
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return feature names of Signals targets. """
    if not input_features:
        feature_names = []
        for window in self.windows:
            feature_names.append(f"target_{window}d_raw")
            feature_names.append(f"target_{window}d_rank")
            feature_names.append(f"target_{window}d_group")
    else:
        feature_names = input_features
    return feature_names

CrossValEstimator

Bases: BaseEstimator, TransformerMixin

Split your data into multiple folds and fit an estimator on each fold. For transforms, predictions are concatenated into a 2D array.

:param cv: Cross validation object that follows scikit-learn conventions.

:param estimator: Estimator to fit on each fold.

:param evaluation_func: Custom evaluation logic that is executed on validation data for each fold. Must accept y_true and y_pred as input. For example, evaluation_func can handle logging metrics for each fold. Anything that evaluation_func returns is stored in self.eval_results_.

:param predict_func: Name of the function that will be used for prediction. Must be one of 'predict', 'predict_proba', 'predict_log_proba'. For example, XGBRegressor has 'predict' and 'predict_proba' functions.

:param verbose: Whether to print progress.
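
A minimal sketch (illustrative only): fit one Ridge model per TimeSeriesSplit fold and stack the per-fold predictions column-wise.

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit
from numerblox.meta import CrossValEstimator

rng = np.random.default_rng(0)
X, y = rng.uniform(size=(500, 10)), rng.uniform(size=500)

cve = CrossValEstimator(estimator=Ridge(), cv=TimeSeriesSplit(n_splits=5))
cve.fit(X, y)
preds = cve.transform(X)             # shape (500, 5): one column per fold model
print(cve.get_feature_names_out())   # ['CrossValEstimator_Ridge_predict_0', ...]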

Source code in numerblox/meta.py
class CrossValEstimator(BaseEstimator, TransformerMixin):
    """
    Split your data into multiple folds and fit an estimator on each fold.
    For transforms, predictions are concatenated into a 2D array.
    :param cv: Cross validation object that follows scikit-learn conventions.
    :param estimator: Estimator to fit on each fold.
    :param evaluation_func: Custom evaluation logic that is executed on validation data for each fold. Must accept y_true and y_pred as input.
    For example, evaluation_func can handle logging metrics for each fold.
    Anything that evaluation_func returns is stored in `self.eval_results_`.
    :param predict_func: Name of the function that will be used for prediction.
    Must be one of 'predict', 'predict_proba', 'predict_log_proba'.
    For example, XGBRegressor has 'predict' and 'predict_proba' functions.
    :param verbose: Whether to print progress.
    """
    def __init__(self, estimator: BaseEstimator, cv: BaseCrossValidator, evaluation_func=None, predict_func="predict", verbose=False):
        sklearn.set_config(enable_metadata_routing=True)
        super().__init__()
        self.cv = cv
        if not hasattr(self.cv, "split") or isinstance(self.cv, str):
            raise ValueError("cv must be a valid sklearn cv object withat least a 'split' function.")
        self.estimator = estimator
        self.estimator_name = estimator.__class__.__name__
        self.evaluation_func = evaluation_func

        if predict_func not in ["predict", "predict_proba", "predict_log_proba"]:
            raise ValueError("predict_func must be 'predict', 'predict_proba', or 'predict_log_proba'.")
        self.predict_func = predict_func
        assert hasattr(self.estimator, self.predict_func), f"Estimator {self.estimator_name} does not have {self.predict_func} function."
        self.verbose = verbose

    def fit(self, X: Union[np.array, pd.DataFrame], y: Union[np.array, pd.Series], **kwargs):
        """ Use cross validation object to fit estimators. """
        self.estimators_ = []
        self.eval_results_ = []
        if isinstance(X, (pd.Series, pd.DataFrame)):
            X = X.reset_index(drop=True).values
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.reset_index(drop=True).values
        for i, (train_idx, val_idx) in tqdm(enumerate(self.cv.split(X, y)), 
                                            desc=f"CrossValEstimator Fitting. Estimator='{self.estimator_name}'", 
                                            total=self.cv.get_n_splits(), 
                                            disable=not self.verbose):
            estimator = clone(self.estimator)
            if self.verbose:
                print(f"Fitting {self.estimator_name} on fold {len(self.estimators_)}")


            estimator.fit(X[train_idx], y[train_idx], **kwargs)

            # Execute custom evaluation logic
            if self.evaluation_func:
                if self.verbose:
                    print(f"Running evaluation on fold {len(self.estimators_)}")

                y_pred = getattr(estimator, self.predict_func)(X[val_idx])
                y_pred = self._postprocess_pred(y_pred)
                eval_fold = self.evaluation_func(y[val_idx], y_pred)
                if self.verbose:
                    print(f"CrossValEstimator (estimator='{self.estimator_name}'): Fold '{i}' evaluation results: '{eval_fold}'")
                self.eval_results_.append(eval_fold)

            self.estimators_.append(estimator)

            # Store output shape by doing inference on 1st sample of training set
            if i == 0:
                sample_prediction = getattr(estimator, self.predict_func)(X[train_idx][:1])
                sample_prediction = self._postprocess_pred(sample_prediction)
                self.output_shape_ = sample_prediction.shape[1:]
                self.multi_output_ = len(y.shape) > 1
                self.n_outputs_per_model_ = np.prod(self.output_shape_).astype(int)
        return self

    def transform(self, X, model_idxs: List[int] = None, **kwargs) -> np.array:
        """ 
        Use the estimators fitted on each cross validation fold to generate predictions.
        :param X: Input data for inference.
        :param model_idxs: List of indices of models to use for inference. 
        By default, all fitted models are used.
        :param kwargs: Additional arguments to pass to the estimator's predict function.
        """
        check_is_fitted(self)        
        inference_estimators = [self.estimators_[i] for i in model_idxs] if model_idxs else self.estimators_

        # Create an empty array to store predictions
        final_predictions = np.zeros((X.shape[0], len(inference_estimators) * self.n_outputs_per_model_))
        # Iterate through models to get predictions
        for idx, estimator in enumerate(inference_estimators):
            pred = getattr(estimator, self.predict_func)(X, **kwargs)
            pred = self._postprocess_pred(pred)

            # Calculate where to place these predictions in the final array
            start_idx = idx * self.n_outputs_per_model_
            end_idx = (idx + 1) * self.n_outputs_per_model_

            final_predictions[:, start_idx:end_idx] = pred

        return final_predictions

    def predict(self, X, model_idxs: List[int] = None, **kwargs) -> np.array:
        return self.transform(X, model_idxs, **kwargs)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        check_is_fitted(self)
        base_str = f"CrossValEstimator_{self.estimator_name}_{self.predict_func}"
        # Single-output case
        if self.n_outputs_per_model_ == 1:
            feature_names = [f"{base_str}_{i}" for i in range(len(self.estimators_))]
        # Multi-output case
        else:
            feature_names = []
            for i in range(len(self.estimators_)):
                for j in range(self.n_outputs_per_model_):
                    feature_names.append(f"{base_str}_{i}_output_{j}")
        return feature_names

    def _postprocess_pred(self, pred):
        # Make sure predictions are 2D
        if len(pred.shape) == 1:
            pred = pred.reshape(-1, 1)
        return pred

    def __sklearn_is_fitted__(self) -> bool:
        """ Check fitted status. """
        # Must have a fitted estimator for each split.
        return len(self.estimators_) == self.cv.get_n_splits()

__sklearn_is_fitted__()

Check fitted status.

Source code in numerblox/meta.py
def __sklearn_is_fitted__(self) -> bool:
    """ Check fitted status. """
    # Must have a fitted estimator for each split.
    return len(self.estimators_) == self.cv.get_n_splits()

fit(X, y, **kwargs)

Use cross validation object to fit estimators.

Source code in numerblox/meta.py
def fit(self, X: Union[np.array, pd.DataFrame], y: Union[np.array, pd.Series], **kwargs):
    """ Use cross validation object to fit estimators. """
    self.estimators_ = []
    self.eval_results_ = []
    if isinstance(X, (pd.Series, pd.DataFrame)):
        X = X.reset_index(drop=True).values
    if isinstance(y, (pd.Series, pd.DataFrame)):
        y = y.reset_index(drop=True).values
    for i, (train_idx, val_idx) in tqdm(enumerate(self.cv.split(X, y)), 
                                        desc=f"CrossValEstimator Fitting. Estimator='{self.estimator_name}'", 
                                        total=self.cv.get_n_splits(), 
                                        disable=not self.verbose):
        estimator = clone(self.estimator)
        if self.verbose:
            print(f"Fitting {self.estimator_name} on fold {len(self.estimators_)}")


        estimator.fit(X[train_idx], y[train_idx], **kwargs)

        # Execute custom evaluation logic
        if self.evaluation_func:
            if self.verbose:
                print(f"Running evaluation on fold {len(self.estimators_)}")

            y_pred = getattr(estimator, self.predict_func)(X[val_idx])
            y_pred = self._postprocess_pred(y_pred)
            eval_fold = self.evaluation_func(y[val_idx], y_pred)
            if self.verbose:
                print(f"CrossValEstimator (estimator='{self.estimator_name}'): Fold '{i}' evaluation results: '{eval_fold}'")
            self.eval_results_.append(eval_fold)

        self.estimators_.append(estimator)

        # Store output shape by doing inference on 1st sample of training set
        if i == 0:
            sample_prediction = getattr(estimator, self.predict_func)(X[train_idx][:1])
            sample_prediction = self._postprocess_pred(sample_prediction)
            self.output_shape_ = sample_prediction.shape[1:]
            self.multi_output_ = len(y.shape) > 1
            self.n_outputs_per_model_ = np.prod(self.output_shape_).astype(int)
    return self

transform(X, model_idxs=None, **kwargs)

Use the estimators fitted on each cross validation fold to generate predictions. :param X: Input data for inference. :param model_idxs: List of indices of models to use for inference. By default, all fitted models are used. :param kwargs: Additional arguments to pass to the estimator's predict function.

Source code in numerblox/meta.py
def transform(self, X, model_idxs: List[int] = None, **kwargs) -> np.array:
    """ 
    Use the estimators fitted on each cross validation fold to generate predictions.
    :param X: Input data for inference.
    :param model_idxs: List of indices of models to use for inference. 
    By default, all fitted models are used.
    :param kwargs: Additional arguments to pass to the estimator's predict function.
    """
    check_is_fitted(self)        
    inference_estimators = [self.estimators_[i] for i in model_idxs] if model_idxs else self.estimators_

    # Create an empty array to store predictions
    final_predictions = np.zeros((X.shape[0], len(inference_estimators) * self.n_outputs_per_model_))
    # Iterate through models to get predictions
    for idx, estimator in enumerate(inference_estimators):
        pred = getattr(estimator, self.predict_func)(X, **kwargs)
        pred = self._postprocess_pred(pred)

        # Calculate where to place these predictions in the final array
        start_idx = idx * self.n_outputs_per_model_
        end_idx = (idx + 1) * self.n_outputs_per_model_

        final_predictions[:, start_idx:end_idx] = pred

    return final_predictions

MetaEstimator

Bases: BaseEstimator, TransformerMixin, MetaEstimatorMixin

Helper for NumeraiPipeline and NumeraiFeatureUnion to use a model as a transformer.

:param estimator: Underlying estimator like XGBoost, Catboost, scikit-learn, etc.

:param predict_func: Name of the function that will be used for prediction. Must be one of 'predict', 'predict_proba', 'predict_log_proba'. For example, XGBRegressor has 'predict' and 'predict_proba' functions.

:param model_type: "regressor" or "classifier". Used to determine if the estimator is multi output.
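
A minimal sketch (illustrative only, with a scikit-learn Ridge standing in for any model): wrap the estimator so its predictions come out of transform and it can sit in the middle of a pipeline.

import numpy as np
from sklearn.linear_model import Ridge
from numerblox.meta import MetaEstimator

rng = np.random.default_rng(0)
X, y = rng.uniform(size=(100, 5)), rng.uniform(size=100)

meta = MetaEstimator(Ridge(), predict_func="predict", model_type="regressor")
meta.fit(X, y)
preds = meta.transform(X)              # shape (100, 1)
print(meta.get_feature_names_out())    # ['Ridge_predict_output']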

Source code in numerblox/meta.py
class MetaEstimator(BaseEstimator, TransformerMixin, MetaEstimatorMixin):
    """
    Helper for NumeraiPipeline and NumeraiFeatureUnion to use a model as a transformer.

    :param estimator: Underlying estimator like XGBoost, Catboost, scikit-learn, etc.
    :param predict_func: Name of the function that will be used for prediction.
    Must be one of 'predict', 'predict_proba', 'predict_log_proba'.
    For example, XGBRegressor has 'predict' and 'predict_proba' functions.
    :param model_type: "regressor" or "classifier". Used to determine if the estimator is multi output.
    """

    def __init__(self, estimator, predict_func="predict", model_type="regressor"):
        sklearn.set_config(enable_metadata_routing=True)
        self.estimator = estimator
        if predict_func not in ["predict", "predict_proba", "predict_log_proba", "transform"]:
            raise ValueError("predict_func must be 'predict', 'predict_proba', 'predict_log_proba' or 'transform'.")
        self.predict_func = predict_func
        assert model_type in ["regressor", "classifier"], "model_type must be 'regressor' or 'classifier'."
        assert hasattr(self.estimator, self.predict_func), f"Estimator {self.estimator.__class__.__name__} does not have {self.predict_func} function."
        self.model_type = model_type
        # predict_proba for classifiers -> multi output
        self.proba_class_ = predict_func == "predict_proba" and model_type == "classifier"

    def fit(self, X: Union[np.array, pd.DataFrame], y, **kwargs):
        """
        Fit underlying estimator and set attributes.
        """
        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES, multi_output=True)
        # Either multi target or outputs are probabilities
        self.multi_output_ = len(y.shape) > 1 or self.proba_class_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X, y, **kwargs)
        return self

    def transform(self, X: Union[np.array, pd.DataFrame], **kwargs) -> np.array:
        """
        Apply the `predict_func` on the fitted estimator.

        Shape `(X.shape[0], )` if estimator is not multi output and else `(X.shape[0], y.shape[1])`.
        All additional kwargs are passed to the underlying estimator's predict function.
        """
        check_is_fitted(self, "estimator_")
        output = getattr(self.estimator_, self.predict_func)(X, **kwargs)
        return output if self.multi_output_ else output.reshape(-1, 1)

    def predict(self, X: Union[np.array, pd.DataFrame], **kwargs) -> np.array:
        """ 
        For if a MetaEstimator happens to be the last step in the pipeline. Has same behavior as transform.
        """
        return self.transform(X, **kwargs)

    def get_feature_names_out(self, input_features = None) -> List[str]:
        check_is_fitted(self)
        feature_names = [f"{self.estimator.__class__.__name__}_{self.predict_func}_output"]
        return feature_names if not input_features else input_features

fit(X, y, **kwargs)

Fit underlying estimator and set attributes.

Source code in numerblox/meta.py
def fit(self, X: Union[np.array, pd.DataFrame], y, **kwargs):
    """
    Fit underlying estimator and set attributes.
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES, multi_output=True)
    # Either multi target or outputs are probabilities
    self.multi_output_ = len(y.shape) > 1 or self.proba_class_
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X, y, **kwargs)
    return self

predict(X, **kwargs)

For if a MetaEstimator happens to be the last step in the pipeline. Has same behavior as transform.

Source code in numerblox/meta.py
def predict(self, X: Union[np.array, pd.DataFrame], **kwargs) -> np.array:
    """ 
    For if a MetaEstimator happens to be the last step in the pipeline. Has same behavior as transform.
    """
    return self.transform(X, **kwargs)

transform(X, **kwargs)

Apply the predict_func on the fitted estimator.

Shape (X.shape[0], ) if estimator is not multi output and else (X.shape[0], y.shape[1]). All additional kwargs are passed to the underlying estimator's predict function.

Source code in numerblox/meta.py
def transform(self, X: Union[np.array, pd.DataFrame], **kwargs) -> np.array:
    """
    Apply the `predict_func` on the fitted estimator.

    Shape `(X.shape[0], )` if estimator is not multi output and else `(X.shape[0], y.shape[1])`.
    All additional kwargs are passed to the underlying estimator's predict function.
    """
    check_is_fitted(self, "estimator_")
    output = getattr(self.estimator_, self.predict_func)(X, **kwargs)
    return output if self.multi_output_ else output.reshape(-1, 1)

MetaPipeline

Bases: Pipeline

Pipeline which turns all estimators into transformers by wrapping them in MetaEstimator. This makes it possible to have pipeline steps after models. For example, a FeatureNeutralizer after an XGBRegressor.

:param steps: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an instance of BaseNeutralizer.

:param memory: Used to cache the fitted transformers of the pipeline.

:param verbose: If True, the time elapsed while fitting each step will be printed as it is completed.

:param predict_func: Name of the function that will be used for prediction.

Source code in numerblox/meta.py
class MetaPipeline(Pipeline):
    """
    Pipeline which turns all estimators into transformers by wrapping them in MetaEstimator.
    This makes it possible to have pipeline steps after models.
    For example, a FeatureNeutralizer after an XGBRegressor.

    :param steps: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an instance of BaseNeutralizer.
    :param memory: Used to cache the fitted transformers of the pipeline.
    :param verbose: If True, the time elapsed while fitting each step will be printed as it is completed.
    :param predict_func: Name of the function that will be used for prediction.
    """
    def __init__(self, steps, memory=None, verbose=False, predict_func="predict"):
        sklearn.set_config(enable_metadata_routing=True)
        self.predict_func = predict_func
        self.modified_steps = self.wrap_estimators_as_transformers(steps)
        self.steps = self.modified_steps
        self.memory = memory
        self.verbose = verbose

    def wrap_estimators_as_transformers(self, steps):
        """
        Converts all estimator steps (except the last step) into transformers by wrapping them in MetaEstimator.
        :param steps: List of (name, transform) tuples specifying the pipeline steps.
        :return: Modified steps with all estimators wrapped as transformers.
        """
        transformed_steps = []
        for i, step_tuple in enumerate(steps):
            is_last_step = i == len(steps) - 1

            if len(step_tuple) == 3:
                name, step, columns = step_tuple
                transformed_steps.append(self._wrap_step(name, step, columns, is_last_step))
            else:
                name, step = step_tuple
                transformed_steps.append(self._wrap_step(name, step, is_last_step=is_last_step))
        return transformed_steps

    def _wrap_step(self, name, step, columns=None, is_last_step=False):
            """ Recursive function to wrap steps """
            # Recursive call
            if isinstance(step, (Pipeline, FeatureUnion, ColumnTransformer)):
                if isinstance(step, Pipeline):
                    transformed = step.__class__(self.wrap_estimators_as_transformers(step.steps))
                elif isinstance(step, FeatureUnion):
                    transformed = FeatureUnion(self.wrap_estimators_as_transformers(step.transformer_list))
                elif isinstance(step, ColumnTransformer):
                    transformed_transformers = self.wrap_estimators_as_transformers(step.transformers)
                    transformed = ColumnTransformer(transformed_transformers)
                return (name, transformed, columns) if columns else (name, transformed)

            # If it's the last step and it doesn't have a transform method, don't wrap it
            if is_last_step and not hasattr(step, 'transform'):
                return (name, step, columns) if columns else (name, step)

            # Wrap estimator that has the predict function but not the transform function
            elif hasattr(step, self.predict_func) and not hasattr(step, 'transform'):
                return (name, MetaEstimator(step, predict_func=self.predict_func))

            return (name, step, columns) if columns else (name, step)

wrap_estimators_as_transformers(steps)

Converts all estimator steps (except the last step) into transformers by wrapping them in MetaEstimator. :param steps: List of (name, transform) tuples specifying the pipeline steps. :return: Modified steps with all estimators wrapped as transformers.

Source code in numerblox/meta.py
def wrap_estimators_as_transformers(self, steps):
    """
    Converts all estimator steps (except the last step) into transformers by wrapping them in MetaEstimator.
    :param steps: List of (name, transform) tuples specifying the pipeline steps.
    :return: Modified steps with all estimators wrapped as transformers.
    """
    transformed_steps = []
    for i, step_tuple in enumerate(steps):
        is_last_step = i == len(steps) - 1

        if len(step_tuple) == 3:
            name, step, columns = step_tuple
            transformed_steps.append(self._wrap_step(name, step, columns, is_last_step))
        else:
            name, step = step_tuple
            transformed_steps.append(self._wrap_step(name, step, is_last_step=is_last_step))
    return transformed_steps

make_meta_pipeline(*steps, memory=None, verbose=False)

Convenience function for creating a MetaPipeline.

:param steps: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an instance of BaseNeutralizer.

:param memory: Used to cache the fitted transformers of the pipeline.

:param verbose: If True, the time elapsed while fitting each step will be printed as it is completed.
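
A minimal construction sketch (illustrative only): a scaler, a model and a neutralizer chained into one pipeline. The Ridge step gets wrapped in a MetaEstimator automatically; fitting and predicting additionally require routing features and era_series to the neutralizer, as hinted in the trailing comment.

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from numerblox.meta import make_meta_pipeline
from numerblox.neutralizers import FeatureNeutralizer

pipeline = make_meta_pipeline(
    StandardScaler(),
    Ridge(),
    FeatureNeutralizer(proportion=0.5),
)
# e.g. pipeline.fit(X, y) followed by
# pipeline.predict(X, features=features, era_series=era_series)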

Source code in numerblox/meta.py
def make_meta_pipeline(*steps, memory=None, verbose=False) -> MetaPipeline:
    """ 
    Convenience function for creating a MetaPipeline. 
    :param steps: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an instance of BaseNeutralizer.
    :param memory: Used to cache the fitted transformers of the pipeline.
    :param verbose: If True, the time elapsed while fitting each step will be printed as it is completed.
    """
    return MetaPipeline(_name_estimators(steps), memory=memory, verbose=verbose)

BaseNeutralizer

Bases: BaseEstimator, TransformerMixin

Base class for neutralization so it is compatible with scikit-learn. :param new_col_names: List of column names for the neutralized output.

Source code in numerblox/neutralizers.py
class BaseNeutralizer(BaseEstimator, TransformerMixin):
    """
    Base class for neutralization so it is compatible with scikit-learn.
    :param new_col_names: List of column names for the neutralized output.
    """
    def __init__(self, new_col_names: list):
        self.new_col_names = new_col_names
        sklearn.set_config(enable_metadata_routing=True)
        self.set_transform_request(features=True, era_series=True)
        self.set_predict_request(features=True, era_series=True)
        super().__init__()

    def fit(self, X=None, y=None):
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], 
        features: pd.DataFrame, era_series: pd.Series
    ) -> np.array:
        ...

    def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
        """ Convenience function for scikit-learn compatibility. """
        return self.transform(X=X, features=features, era_series=era_series)

    def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
        """ 
        Convenience function for scikit-learn compatibility.
        Needed because fit and transform expect different arguments here.
        """
        return self.fit().transform(X=X, features=features, era_series=era_series)

    def get_feature_names_out(self, input_features: list = None) -> list:
        """ 
        Get feature names for neutralized output.

        :param input_features: Optional list of input feature names.
        :return: List of feature names for neutralized output.
        """
        return input_features if input_features else self.new_col_names

fit_transform(X, features, era_series=None)

Convenience function for scikit-learn compatibility. Needed because fit and transform expect different arguments here.

Source code in numerblox/neutralizers.py
def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
    """ 
    Convenience function for scikit-learn compatibility.
    Needed because fit and transform expect different arguments here.
    """
    return self.fit().transform(X=X, features=features, era_series=era_series)

get_feature_names_out(input_features=None)

Get feature names for neutralized output.

:param input_features: Optional list of input feature names. :return: List of feature names for neutralized output.

Source code in numerblox/neutralizers.py
def get_feature_names_out(self, input_features: list = None) -> list:
    """ 
    Get feature names for neutralized output.

    :param input_features: Optional list of input feature names.
    :return: List of feature names for neutralized output.
    """
    return input_features if input_features else self.new_col_names

predict(X, features, era_series=None)

Convenience function for scikit-learn compatibility.

Source code in numerblox/neutralizers.py
def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
    """ Convenience function for scikit-learn compatibility. """
    return self.transform(X=X, features=features, era_series=era_series)

FeatureNeutralizer

Bases: BaseNeutralizer

Classic feature neutralization by subtracting a linear model.

:param pred_name: Name of prediction column. For creating the new column name.

:param proportion: Number in range [0...1] indicating how much to neutralize.

:param suffix: Optional suffix that is added to new column name.

:param num_cores: Number of cores to use for parallel processing. By default, all CPU cores are used.
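
A minimal sketch (illustrative only): neutralize one prediction column against the features it was built from, per era, at 50% proportion.

import numpy as np
import pandas as pd
from numerblox.neutralizers import FeatureNeutralizer

rng = np.random.default_rng(0)
n = 200
features = pd.DataFrame(rng.uniform(size=(n, 5)), columns=[f"feature_{i}" for i in range(5)])
predictions = rng.uniform(size=n)
era_series = pd.Series(np.repeat(np.arange(1, 5), 50), name="era")

neutralizer = FeatureNeutralizer(pred_name="prediction", proportion=0.5, num_cores=1)
neutralized = neutralizer.fit_transform(predictions, features=features, era_series=era_series)
# -> array of shape (200, 1), scaled to [0...1]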

Source code in numerblox/neutralizers.py
class FeatureNeutralizer(BaseNeutralizer):
    """
    Classic feature neutralization by subtracting a linear model.

    :param pred_name: Name of prediction column. For creating the new column name. 
    :param proportion: Number in range [0...1] indicating how much to neutralize.
    :param suffix: Optional suffix that is added to new column name.
    :param num_cores: Number of cores to use for parallel processing.
    By default, all CPU cores are used.
    """
    def __init__(
        self,
        pred_name: Union[str, list] = "prediction",
        proportion: Union[float, List[float]] = 0.5,
        suffix: str = None,
        num_cores: int = -1
    ):
        self.pred_name = [pred_name] if isinstance(pred_name, str) else pred_name
        self.proportion = [proportion] if isinstance(proportion, float) else proportion
        assert len(self.pred_name) == len(set(self.pred_name)), "Duplicate 'pred_names' found. Make sure all names are unique."
        assert len(self.proportion) == len(set(self.proportion)), "Duplicate 'proportions' found. Make sure all proportions are unique."
        for prop in self.proportion:
            assert (
                0.0 <= prop <= 1.0
            ), f"'proportion' should be a float in range [0...1]. Got '{prop}'."

        new_col_names = []
        for pred_name in self.pred_name:
            for prop in self.proportion:
                new_col_names.append(
                    f"{pred_name}_neutralized_{prop}_{suffix}" if suffix else f"{pred_name}_neutralized_{prop}"
                )
        super().__init__(new_col_names=new_col_names)
        self.suffix = suffix
        self.num_cores = num_cores

    def transform(self, X: Union[np.array, pd.Series, pd.DataFrame], 
                  features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
        """
        Main transform function.
        :param X: Input predictions to neutralize. \n
        :param features: DataFrame with features for neutralization. \n
        :param era_series: Series with era labels for each row in features. \n
        Features, era_series and the prediction column must all have the same length.
        :return: Neutralized predictions NumPy array.
        """
        if era_series is None:
            warnings.warn("WARNING: 'era_series' not provided for neutralization! Neutralization will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to neutralization in this way! Not providing 'era_series' is valid for live inference, where only one era is used to generate predictions.")
        else:
            assert len(X) == len(era_series), "Input predictions must have same length as era_series."

        if features is None:
            raise ValueError("`features` argument must be provided for neutralization.")
        assert len(X) == len(features), "Input predictions must have same length as features."

        df = features.copy()
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        # Ensure X is a 2D array and has the same number of columns as pred_name
        if X.ndim == 1:
            assert len(self.pred_name) == 1, "Only one prediction column found. Please input a 2D array or define one column for 'pred_name'."
            X = X.reshape(-1, 1)
        else:
            assert len(self.pred_name) == X.shape[1], "Number of prediction columns given in X does not match 'pred_name'."
        for i, pred_name in enumerate(self.pred_name):
            df[pred_name] = X[:, i]
        # Treat input as 1 era if era_series is not provided.
        df["era"] = era_series if era_series is not None else "X"

        feature_cols = list(features.columns)
        tasks = [
            delayed(self._process_pred_name)(df, pred_name, proportion, feature_cols)
            for pred_name in tqdm(self.pred_name, desc="Processing feature neutralizations") 
            for proportion in self.proportion
        ]
        neutralized_results = Parallel(n_jobs=self.num_cores)(tasks)
        neutralized_preds = pd.concat(neutralized_results, axis=1).to_numpy()
        return neutralized_preds

    def _process_pred_name(self, df: pd.DataFrame, pred_name: str, proportion: float, feature_cols: List[str]) -> pd.DataFrame:
        """ 
        Process one combination of prediction and proportion.
        :param df: DataFrame with features and predictions.
        :param pred_name: Name of prediction column.
        :param proportion: Proportion to neutralize.
        :param feature_cols: List of feature column names.
        :return: Neutralized predictions.
        Neutralized predictions are scaled to [0...1].
        """
        neutralized_pred = df.groupby("era", group_keys=False).apply(
            lambda x: self.normalize_and_neutralize(x, [pred_name], feature_cols, proportion)
        )
        return pd.DataFrame(MinMaxScaler().fit_transform(neutralized_pred))

    def neutralize(self, dataf: pd.DataFrame, columns: list, by: list, proportion: float) -> pd.DataFrame:
        """ 
        Neutralize on CPU. 
        :param dataf: DataFrame with features and predictions.
        :param columns: List of prediction column names.
        :param by: List of feature column names.
        :param proportion: Proportion to neutralize.
        :return: Neutralized predictions.
        """
        scores = dataf[columns]
        exposures = dataf[by].values
        scores = scores - proportion * self._get_raw_exposures(exposures, scores)
        return scores / scores.std()

    @staticmethod
    def normalize(dataf: pd.DataFrame) -> np.ndarray:
        """ Normalize predictions.
        1. Rank predictions.
        2. Normalize ranks.
        3. Gaussianize ranks.
        :param dataf: DataFrame with predictions.
        :return: Gaussianized rank predictions.
        """
        normalized_ranks = (dataf.rank(method="first") - 0.5) / len(dataf)
        # Gaussianized ranks
        return sp.norm.ppf(normalized_ranks)

    def normalize_and_neutralize(
        self, dataf: pd.DataFrame, columns: list, by: list, proportion: float
    ) -> pd.DataFrame:
        """ 
        Gaussianize predictions and neutralize with one combination of prediction and proportion. 
        :param dataf: DataFrame with features and predictions.
        :param columns: List of prediction column names.
        :param by: List of feature column names.
        :param proportion: Proportion to neutralize.
        :return: Neutralized predictions DataFrame.
        """
        dataf[columns] = self.normalize(dataf[columns])
        dataf[columns] = self.neutralize(dataf, columns, by, proportion)
        return dataf[columns]

    @staticmethod
    def _get_raw_exposures(exposures: np.array, scores: pd.DataFrame) -> np.array:
        """ 
        Get raw feature exposures.
        Make sure predictions are normalized!
        :param exposures: Exposures for each era. 
        :param scores: DataFrame with predictions.
        :return: Raw exposures for each era.
        """
        return exposures.dot(np.linalg.pinv(exposures).dot(scores))   

neutralize(dataf, columns, by, proportion)

Neutralize on CPU. :param dataf: DataFrame with features and predictions. :param columns: List of prediction column names. :param by: List of feature column names. :param proportion: Proportion to neutralize. :return: Neutralized predictions.

Source code in numerblox/neutralizers.py
def neutralize(self, dataf: pd.DataFrame, columns: list, by: list, proportion: float) -> pd.DataFrame:
    """ 
    Neutralize on CPU. 
    :param dataf: DataFrame with features and predictions.
    :param columns: List of prediction column names.
    :param by: List of feature column names.
    :param proportion: Proportion to neutralize.
    :return: Neutralized predictions.
    """
    scores = dataf[columns]
    exposures = dataf[by].values
    scores = scores - proportion * self._get_raw_exposures(exposures, scores)
    return scores / scores.std()
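
The subtraction in neutralize is a linear projection: the scores are regressed on the feature columns via the pseudoinverse and the fitted part is removed, scaled by proportion. A minimal sketch of that identity with hypothetical random data (not part of NumerBlox):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
exposures = rng.uniform(size=(100, 5))                       # stand-in for dataf[by].values
scores = pd.DataFrame({"prediction": rng.normal(size=100)})  # stand-in for normalized scores

# The term removed by neutralize (proportion=1.0) is the least-squares
# projection of the scores onto the feature columns.
projection = exposures.dot(np.linalg.pinv(exposures).dot(scores))
residual = scores - projection

# The residual is orthogonal to every feature column (up to floating point error).
print(np.abs(exposures.T @ residual.values).max())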

normalize(dataf) staticmethod

Normalize predictions.
1. Rank predictions.
2. Normalize ranks.
3. Gaussianize ranks.

:param dataf: DataFrame with predictions. :return: Gaussianized rank predictions.

Source code in numerblox/neutralizers.py
@staticmethod
def normalize(dataf: pd.DataFrame) -> np.ndarray:
    """ Normalize predictions.
    1. Rank predictions.
    2. Normalize ranks.
    3. Gaussianize ranks.
    :param dataf: DataFrame with predictions.
    :return: Gaussianized rank predictions.
    """
    normalized_ranks = (dataf.rank(method="first") - 0.5) / len(dataf)
    # Gaussianized ranks
    return sp.norm.ppf(normalized_ranks)
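
For intuition, a tiny worked example of normalize, assuming scipy.stats is imported as sp as in the source above:

import pandas as pd
import scipy.stats as sp

preds = pd.DataFrame({"prediction": [0.10, 0.40, 0.45, 0.90]})
ranks = (preds.rank(method="first") - 0.5) / len(preds)
# ranks: 0.125, 0.375, 0.625, 0.875
print(sp.norm.ppf(ranks).ravel())
# approx: [-1.15, -0.32, 0.32, 1.15]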

normalize_and_neutralize(dataf, columns, by, proportion)

Gaussianize predictions and neutralize with one combination of prediction and proportion. :param dataf: DataFrame with features and predictions. :param columns: List of prediction column names. :param by: List of feature column names. :param proportion: Proportion to neutralize. :return: Neutralized predictions DataFrame.

Source code in numerblox/neutralizers.py
def normalize_and_neutralize(
    self, dataf: pd.DataFrame, columns: list, by: list, proportion: float
) -> pd.DataFrame:
    """ 
    Gaussianize predictions and neutralize with one combination of prediction and proportion. 
    :param dataf: DataFrame with features and predictions.
    :param columns: List of prediction column names.
    :param by: List of feature column names.
    :param proportion: Proportion to neutralize.
    :return: Neutralized predictions DataFrame.
    """
    dataf[columns] = self.normalize(dataf[columns])
    dataf[columns] = self.neutralize(dataf, columns, by, proportion)
    return dataf[columns]

transform(X, features, era_series=None)

Main transform function. :param X: Input predictions to neutralize.

:param features: DataFrame with features for neutralization.

:param era_series: Series with era labels for each row in features.

Features, era_series and the prediction column must all have the same length. :return: Neutralized predictions NumPy array.

Source code in numerblox/neutralizers.py
def transform(self, X: Union[np.array, pd.Series, pd.DataFrame], 
              features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array:
    """
    Main transform function.
    :param X: Input predictions to neutralize. \n
    :param features: DataFrame with features for neutralization. \n
    :param era_series: Series with era labels for each row in features. \n
    Features, era_series and the prediction column must all have the same length.
    :return: Neutralized predictions NumPy array.
    """
    if era_series is None:
        warnings.warn("WARNING: 'era_series' not provided for neutralization! Neutralization will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to neutralization in this way! Not providing 'era_series' is valid for live inference, where only one era is used to generate predictions.")
    else:
        assert len(X) == len(era_series), "Input predictions must have same length as era_series."

    if features is None:
        raise ValueError("`features` argument must be provided for neutralization.")
    assert len(X) == len(features), "Input predictions must have same length as features."

    df = features.copy()
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    # Ensure X is a 2D array and has the same number of columns as pred_name
    if X.ndim == 1:
        assert len(self.pred_name) == 1, "Only one prediction column found. Please input a 2D array or define one column for 'pred_name'."
        X = X.reshape(-1, 1)
    else:
        assert len(self.pred_name) == X.shape[1], "Number of prediction columns given in X does not match 'pred_name'."
    for i, pred_name in enumerate(self.pred_name):
        df[pred_name] = X[:, i]
    # Treat input as 1 era if era_series is not provided.
    df["era"] = era_series if era_series is not None else "X"

    feature_cols = list(features.columns)
    tasks = [
        delayed(self._process_pred_name)(df, pred_name, proportion, feature_cols)
        for pred_name in tqdm(self.pred_name, desc="Processing feature neutralizations") 
        for proportion in self.proportion
    ]
    neutralized_results = Parallel(n_jobs=self.num_cores)(tasks)
    neutralized_preds = pd.concat(neutralized_results, axis=1).to_numpy()
    return neutralized_preds
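
For reference, a minimal usage sketch with hypothetical random data. It mirrors how BaseEvaluator.feature_neutral_mean_std_sharpe (shown further down this page) calls FeatureNeutralizer, and assumes the class is importable from numerblox.neutralizers:

import numpy as np
import pandas as pd
from numerblox.neutralizers import FeatureNeutralizer

rng = np.random.default_rng(0)
features = pd.DataFrame(rng.uniform(size=(200, 10)),
                        columns=[f"feature_{i}" for i in range(10)])
predictions = pd.Series(rng.uniform(size=200), name="prediction")
eras = pd.Series(np.repeat(["0001", "0002", "0003", "0004"], 50), name="era")

fn = FeatureNeutralizer(pred_name="prediction", proportion=0.5)
neutralized = fn.predict(predictions, features=features, era_series=eras)
print(neutralized.shape)  # (200, 1): one column per (pred_name, proportion) pair, scaled to [0...1]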

BasePenalizer

Bases: BaseEstimator, TransformerMixin

Base class for penalization so it is compatible with scikit-learn. :param new_col_name: Name of new neutralized column.

Source code in numerblox/penalizers.py
class BasePenalizer(BaseEstimator, TransformerMixin):
    """
    Base class for penalization so it is compatible with scikit-learn.
    :param new_col_name: Name of new neutralized column.
    """
    def __init__(self, new_col_name: str):
        sklearn.set_config(enable_metadata_routing=True)
        self.set_transform_request(features=True, era_series=True)
        self.set_predict_request(features=True, era_series=True)
        self.new_col_name = new_col_name
        super().__init__()

    def fit(self, X=None, y=None):
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], 
        features: pd.DataFrame, era_series: pd.Series
    ) -> np.array:
        ...

    def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array:
        """ Convenience function for scikit-learn compatibility. """
        return self.transform(X=X, features=features, era_series=era_series)

    def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array:
        """ 
        Convenience function for scikit-learn compatibility.
        Needed because fit and transform expect different arguments here.
        """
        return self.fit().transform(X=X, features=features, era_series=era_series)

    def get_feature_names_out(self, input_features: list = None) -> list:
        """ 
        Get feature names for neutralized output.

        :param input_features: Optional list of input feature names.
        :return: List of feature names for neutralized output.
        """
        return input_features if input_features else [self.new_col_name]

fit_transform(X, features, era_series)

Convenience function for scikit-learn compatibility. Needed because fit and transform expect different arguments here.

Source code in numerblox/penalizers.py
def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array:
    """ 
    Convenience function for scikit-learn compatibility.
    Needed because fit and transform expect different arguments here.
    """
    return self.fit().transform(X=X, features=features, era_series=era_series)

get_feature_names_out(input_features=None)

Get feature names for neutralized output.

:param input_features: Optional list of input feature names. :return: List of feature names for neutralized output.

Source code in numerblox/penalizers.py
def get_feature_names_out(self, input_features: list = None) -> list:
    """ 
    Get feature names for neutralized output.

    :param input_features: Optional list of input feature names.
    :return: List of feature names for neutralized output.
    """
    return input_features if input_features else [self.new_col_name]

predict(X, features, era_series)

Convenience function for scikit-learn compatibility.

Source code in numerblox/penalizers.py
def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array:
    """ Convenience function for scikit-learn compatibility. """
    return self.transform(X=X, features=features, era_series=era_series)
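
To illustrate the contract that subclasses must fulfil, here is a toy (hypothetical) penalizer that only clips predictions and ignores features and eras:

import numpy as np
import pandas as pd
from numerblox.penalizers import BasePenalizer

class ClippingPenalizer(BasePenalizer):
    """Toy example: clip predictions instead of penalizing feature exposure."""
    def __init__(self):
        super().__init__(new_col_name="prediction_clipped")

    def transform(self, X, features: pd.DataFrame, era_series: pd.Series) -> np.array:
        # A real penalizer would use `features` and `era_series`; this toy does not.
        return np.clip(np.asarray(X), 0.05, 0.95)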

FeaturePenalizer

Bases: BasePenalizer

Feature penalization with TensorFlow.

Source (by jrb): https://github.com/jonrtaylor/twitch/blob/master/FE_Clipping_Script.ipynb

Source of first PyTorch implementation (by Michael Oliver / mdo): https://forum.numer.ai/t/model-diagnostics-feature-exposure/899/12

:param max_exposure: Number in range [0...1] indicating how much to reduce max feature exposure to. :param pred_name: Prediction column name. Used for new column name.

:param suffix: Optional suffix that is added to new column name.

Source code in numerblox/penalizers.py
class FeaturePenalizer(BasePenalizer):
    """
    Feature penalization with TensorFlow.

    Source (by jrb): https://github.com/jonrtaylor/twitch/blob/master/FE_Clipping_Script.ipynb

    Source of first PyTorch implementation (by Michael Oliver / mdo): https://forum.numer.ai/t/model-diagnostics-feature-exposure/899/12

    :param max_exposure: Number in range [0...1] indicating how much to reduce max feature exposure to.
    :param pred_name: Prediction column name. Used for new column name. \n
    :param suffix: Optional suffix that is added to new column name.
    """
    def __init__(
        self,
        max_exposure: float,
        pred_name: str = "prediction",
        suffix: str = None,
    ):
        self.max_exposure = max_exposure
        self.pred_name = pred_name
        assert (
            0.0 <= max_exposure <= 1.0
        ), f"'max_exposure' should be a float in range [0...1]. Got '{self.max_exposure}'."
        new_col_name = (
            f"{self.pred_name}_penalized_{self.max_exposure}_{suffix}"
            if suffix
            else f"{self.pred_name}_penalized_{self.max_exposure}"
        )
        super().__init__(new_col_name=new_col_name)
        self.suffix = suffix

    def transform(self, X: pd.DataFrame, features: pd.DataFrame, era_series: pd.Series) -> np.array:
        """
        Main transform method.
        :param X: Input predictions to neutralize. 
        :param features: DataFrame with features for neutralization. 
        :param era_series: Series with era labels for each row in features. 
        Features, eras and the prediction column must all have the same length.
        :return: Penalized predictions.
        """
        assert len(X) == len(features), "Input predictions must have same length as features."
        assert len(X) == len(era_series), "Input predictions must have same length as eras."
        df = features.copy()
        df["prediction"] = X
        df["era"] = era_series
        penalized_data = self._reduce_all_exposures(
            dataf=df, column=self.pred_name, neutralizers=list(features.columns)
        )
        return penalized_data

    def _reduce_all_exposures(
        self,
        dataf: pd.DataFrame,
        column: str = "prediction",
        neutralizers: list = None,
        normalize=True,
        gaussianize=True,
    ) -> pd.DataFrame:
        neutralized = []

        for era in tqdm(dataf["era"].unique()):
            dataf_era = dataf[dataf["era"] == era]
            scores = dataf_era[[column]].values
            exposure_values = dataf_era[neutralizers].values

            if normalize:
                scores2 = []
                for x in scores.T:
                    x = (scipy.stats.rankdata(x, method="ordinal") - 0.5) / len(x)
                    if gaussianize:
                        x = scipy.stats.norm.ppf(x)
                    scores2.append(x)
                scores = np.array(scores2)[0]

            scores, _ = self._reduce_exposure(
                scores, exposure_values, len(neutralizers), None
            )

            scores /= tf.math.reduce_std(scores)
            scores -= tf.reduce_min(scores)
            scores /= tf.reduce_max(scores)
            neutralized.append(scores.numpy())

        predictions = pd.DataFrame(
            np.concatenate(neutralized), columns=[column], index=dataf.index
        )
        return predictions

    def _reduce_exposure(self, prediction, features, input_size=50, weights=None):
        model = tf.keras.models.Sequential(
            [
                tf.keras.layers.Input(input_size),
                tf.keras.experimental.LinearModel(use_bias=False),
            ]
        )
        feats = tf.convert_to_tensor(features - 0.5, dtype=tf.float32)
        pred = tf.convert_to_tensor(prediction, dtype=tf.float32)
        if weights is None:
            optimizer = tf.keras.optimizers.Adamax()
            start_exp = self.__exposures(feats, pred[:, None])
            target_exps = tf.clip_by_value(
                start_exp, -self.max_exposure, self.max_exposure
            )
            self._train_loop(model, optimizer, feats, pred, target_exps)
        else:
            model.set_weights(weights)
        return pred[:, None] - model(feats), model.get_weights()

    def _train_loop(self, model, optimizer, feats, pred, target_exps):
        for _ in range(1000000):
            loss, grads = self.__train_loop_body(model, feats, pred, target_exps)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            if loss < 1e-7:
                break

    def __train_loop_body(self, model, feats, pred, target_exps):
        with tf.GradientTape() as tape:
            exps = self.__exposures(feats, pred[:, None] - model(feats, training=True))
            loss = tf.reduce_sum(
                tf.nn.relu(tf.nn.relu(exps) - tf.nn.relu(target_exps))
                + tf.nn.relu(tf.nn.relu(-exps) - tf.nn.relu(-target_exps))
            )
        return loss, tape.gradient(loss, model.trainable_variables)

    @staticmethod
    def __exposures(x, y):
        x = x - tf.math.reduce_mean(x, axis=0)
        x = x / tf.norm(x, axis=0)
        y = y - tf.math.reduce_mean(y, axis=0)
        y = y / tf.norm(y, axis=0)
        return tf.matmul(x, y, transpose_a=True)

transform(X, features, era_series)

Main transform method. :param X: Input predictions to neutralize. :param features: DataFrame with features for neutralization. :param era_series: Series with era labels for each row in features. Features, eras and the prediction column must all have the same length. :return: Penalized predictions.

Source code in numerblox/penalizers.py
def transform(self, X: pd.DataFrame, features: pd.DataFrame, era_series: pd.Series) -> np.array:
    """
    Main transform method.
    :param X: Input predictions to neutralize. 
    :param features: DataFrame with features for neutralization. 
    :param era_series: Series with era labels for each row in features. 
    Features, eras and the prediction column must all have the same length.
    :return: Penalized predictions.
    """
    assert len(X) == len(features), "Input predictions must have same length as features."
    assert len(X) == len(era_series), "Input predictions must have same length as eras."
    df = features.copy()
    df["prediction"] = X
    df["era"] = era_series
    penalized_data = self._reduce_all_exposures(
        dataf=df, column=self.pred_name, neutralizers=list(features.columns)
    )
    return penalized_data
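
A minimal usage sketch with hypothetical data, assuming FeaturePenalizer is importable from numerblox.penalizers. Note that it runs a TensorFlow optimization per era, so it is considerably slower than FeatureNeutralizer:

import numpy as np
import pandas as pd
from numerblox.penalizers import FeaturePenalizer

rng = np.random.default_rng(0)
features = pd.DataFrame(rng.uniform(size=(100, 10)),
                        columns=[f"feature_{i}" for i in range(10)])
predictions = pd.Series(rng.uniform(size=100), name="prediction")
eras = pd.Series(np.repeat(["0001", "0002"], 50), name="era")

penalizer = FeaturePenalizer(max_exposure=0.10)
penalized = penalizer.predict(predictions, features=features, era_series=eras)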

BasePredictionLoader

Bases: BaseEstimator, TransformerMixin

Shared functionality for all Prediction Loaders.

Source code in numerblox/prediction_loaders.py
class BasePredictionLoader(BaseEstimator, TransformerMixin):
    """ Shared functionality for all Prediction Loaders. """
    def __init__(self):
        ...

    def fit(self, X=None, y=None):
        return self

    @abstractmethod
    def transform(self, X=None, y=None) -> pd.DataFrame:
        """ Return Predictions generated by model. """
        ...

    @abstractmethod
    def get_feature_names_out(self, input_features=None):
        """ Return feature names. """
        ...

get_feature_names_out(input_features=None) abstractmethod

Return feature names.

Source code in numerblox/prediction_loaders.py
@abstractmethod
def get_feature_names_out(self, input_features=None):
    """ Return feature names. """
    ...

transform(X=None, y=None) abstractmethod

Return Predictions generated by model.

Source code in numerblox/prediction_loaders.py
@abstractmethod
def transform(self, X=None, y=None) -> pd.DataFrame:
    """ Return Predictions generated by model. """
    ...
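
A minimal sketch (not part of NumerBlox) of a custom loader that fulfils this contract by reading predictions from a local Parquet file; the path is hypothetical:

import pandas as pd
from pathlib import Path
from numerblox.prediction_loaders import BasePredictionLoader

class LocalParquetPredictions(BasePredictionLoader):
    """Toy loader: return predictions stored in a local Parquet file."""
    def __init__(self, file_path: str = "my_model_preds.parquet"):
        super().__init__()
        self.file_path = file_path

    def transform(self, X=None, y=None) -> pd.DataFrame:
        return pd.read_parquet(self.file_path)

    def get_feature_names_out(self, input_features=None):
        return input_features or [Path(self.file_path).stem]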

ExamplePredictions

Bases: BasePredictionLoader

Load example predictions. :param file_name: File to download from NumerAPI. By default this is example predictions for v4.2 data ('v4.2/live_example_preds.parquet'). Example predictions in previous versions:
- v4.2 validation examples -> "v4.2/validation_example_preds.parquet"
- v4.2 live benchmark models -> "v4.2/live_benchmark_models.parquet"
- v4.2 validation benchmark models -> "v4.2/validation_benchmark_models.parquet"

:param round_num: Optional round number. Downloads most recent round by default. :param keep_files: Whether to keep downloaded files. By default, files are deleted after the predictions are loaded.

Source code in numerblox/prediction_loaders.py
class ExamplePredictions(BasePredictionLoader):
    """
    Load example predictions.
    :param file_name: File to download from NumerAPI.
    By default this is example predictions for v4.2 data.
    'v4.2/live_example_preds.parquet' by default. 
    Example predictions in previous versions:
    - v4.2. validation examples -> "v4.2/validation_example_preds.parquet"
    - v4.2. live benchmark models -> "v4.2/live_benchmark_models.parquet"
    - v4.2. validation benchmark models -> "v4.2/validation_benchmark_models.parquet"
    :param round_num: Optional round number. Downloads most recent round by default.
    :param keep_files: Whether to keep downloaded files.
    By default, files are deleted after the predictions are loaded.
    """
    def __init__(self, file_name: str = "v4.2/live_example_preds.parquet",
                 round_num: int = None, keep_files: bool = False):
        super().__init__()
        self.file_name = file_name
        self.round_num = round_num
        self.keep_files = keep_files

    def transform(self, X=None, y=None) -> pd.DataFrame:
        """ Return example predictions. """
        self._download_example_preds()
        example_preds = self._load_example_preds()
        if not self.keep_files:
            self.downloader.remove_base_directory()
        return example_preds

    def _download_example_preds(self):
        data_directory = f"example_predictions_loader_{uuid4()}"
        self.downloader = NumeraiClassicDownloader(directory_path=data_directory)
        self.dest_path = f"{str(self.downloader.dir)}/{self.file_name}"
        self.downloader.download_single_dataset(filename=self.file_name,
                                                dest_path=self.dest_path,
                                                round_num=self.round_num)

    def _load_example_preds(self, *args, **kwargs):
        return pd.read_parquet(self.dest_path, *args, **kwargs)

    def get_feature_names_out(self, input_features=None):
        return [Path(self.file_name).with_suffix('').as_posix()] if not input_features else input_features

transform(X=None, y=None)

Return example predictions.

Source code in numerblox/prediction_loaders.py
def transform(self, X=None, y=None) -> pd.DataFrame:
    """ Return example predictions. """
    self._download_example_preds()
    example_preds = self._load_example_preds()
    if not self.keep_files:
        self.downloader.remove_base_directory()
    return example_preds
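
A minimal usage sketch; this downloads data through NumerAPI, so it needs network access:

from numerblox.prediction_loaders import ExamplePredictions

example_preds = ExamplePredictions().transform()
print(example_preds.head())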

BaseEvaluator

Evaluation functionality that is relevant for both Numerai Classic and Numerai Signals.

Metrics include:
- Mean, Standard Deviation and Sharpe (Corrv2) for era returns.
- Max drawdown.
- Annual Percentage Yield (APY).
- Correlation with benchmark predictions.
- Max feature exposure: https://forum.numer.ai/t/model-diagnostics-feature-exposure/899.
- Feature Neutral Mean, Standard deviation and Sharpe: https://docs.numer.ai/tournament/feature-neutral-correlation.
- Smart Sharpe
- Exposure Dissimilarity: https://forum.numer.ai/t/true-contribution-details/5128/4.
- Autocorrelation (1st order).
- Calmar Ratio.
- Performance vs. Benchmark predictions.
- Mean, Standard Deviation and Sharpe for TB200 (Buy top 200 stocks and sell bottom 200 stocks).
- Mean, Standard Deviation and Sharpe for TB500 (Buy top 500 stocks and sell bottom 500 stocks).

:param metrics_list: List of metrics to calculate. Default: FAST_METRICS. :param era_col: Column name pointing to eras. Most commonly "era" for Numerai Classic and "date" for Numerai Signals. :param custom_functions: Additional functions called in evaluation. Check out the NumerBlox docs on evaluation for more info on using custom functions. :param show_detailed_progress_bar: Show detailed progress bar for evaluation of each prediction column.

Note that we calculate the sample standard deviation with ddof=0. It may differ slightly from the standard Pandas calculation, but is consistent with how NumPy computes standard deviation. More info: https://stackoverflow.com/questions/24984178/different-std-in-pandas-vs-numpy
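
As a concrete illustration of the custom_functions format consumed in evaluation_one_col (see the source below), here is a hypothetical metric whose 'dataf' and 'pred_col' arguments are resolved from the evaluator's local variables via 'local_args':

import pandas as pd

def median_prediction(dataf: pd.DataFrame, pred_col: str) -> float:
    """Hypothetical custom metric: median of the prediction column."""
    return dataf[pred_col].median()

custom_functions = {
    "median_prediction": {
        "func": median_prediction,
        "args": {"dataf": "dataf", "pred_col": "pred_col"},
        "local_args": ["dataf", "pred_col"],
    }
}
# Passed to an evaluator via the custom_functions parameter described above.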

Source code in numerblox/evaluation.py
class BaseEvaluator:
    """
    Evaluation functionality that is relevant for both
    Numerai Classic and Numerai Signals.

    Metrics include:
    - Mean, Standard Deviation and Sharpe (Corrv2) for era returns.
    - Max drawdown.
    - Annual Percentage Yield (APY).
    - Correlation with benchmark predictions.
    - Max feature exposure: https://forum.numer.ai/t/model-diagnostics-feature-exposure/899.
    - Feature Neutral Mean, Standard deviation and Sharpe: https://docs.numer.ai/tournament/feature-neutral-correlation.
    - Smart Sharpe
    - Exposure Dissimilarity: https://forum.numer.ai/t/true-contribution-details/5128/4.
    - Autocorrelation (1st order).
    - Calmar Ratio.
    - Performance vs. Benchmark predictions.
    - Mean, Standard Deviation and Sharpe for TB200 (Buy top 200 stocks and sell bottom 200 stocks).
    - Mean, Standard Deviation and Sharpe for TB500 (Buy top 500 stocks and sell bottom 500 stocks).

    :param metrics_list: List of metrics to calculate. Default: FAST_METRICS.
    :param era_col: Column name pointing to eras. Most commonly "era" for Numerai Classic and "date" for Numerai Signals.
    :param custom_functions: Additional functions called in evaluation.
    Check out the NumerBlox docs on evaluation for more info on using custom functions.
    :param show_detailed_progress_bar: Show detailed progress bar for evaluation of each prediction column.

    Note that we calculate the sample standard deviation with ddof=0.
    It may differ slightly from the standard Pandas calculation, but
    is consistent with how NumPy computes standard deviation.
    More info:
    https://stackoverflow.com/questions/24984178/different-std-in-pandas-vs-numpy
    """

    def __init__(
        self,
        metrics_list: List[str],
        era_col: str,
        custom_functions: Dict[str, Dict[str, Any]],
        show_detailed_progress_bar: bool,
    ):
        self.era_col = era_col
        self.metrics_list = metrics_list
        self.custom_functions = custom_functions
        if self.custom_functions is not None:
            self.check_custom_functions()
        self.show_detailed_progress_bar = show_detailed_progress_bar
        sklearn.set_config(enable_metadata_routing=True)

    def full_evaluation(
        self,
        dataf: pd.DataFrame,
        pred_cols: List[str],
        target_col: str = "target",
        benchmark_cols: list = None,
    ) -> pd.DataFrame:
        """
        Perform evaluation for each prediction column in pred_cols.
        By default only the "prediction" column is evaluated.
        Evaluation is done against given target and benchmark prediction column.
        :param dataf: DataFrame containing era_col, pred_cols, target_col and optional benchmark_cols.
        :param pred_cols: List of prediction columns to calculate evaluation metrics for.
        :param target_col: Target column to evaluate against.
        :param benchmark_cols: Optional list of benchmark columns to calculate evaluation metrics for.
        """
        val_stats = pd.DataFrame()
        feature_cols = [col for col in dataf.columns if col.startswith("feature")]
        cat_cols = (
            dataf[feature_cols].select_dtypes(include=["category"]).columns.to_list()
        )
        if cat_cols:
            print(
                f"WARNING: Categorical features detected that cannot be used for neutralization. Removing columns: '{cat_cols}' for evaluation."
            )
            dataf.loc[:, feature_cols] = dataf[feature_cols].select_dtypes(
                exclude=["category"]
            )
        dataf = dataf.fillna(0.5)
        for col in tqdm(pred_cols, desc="Evaluation: "):
            col_stats = self.evaluation_one_col(
                dataf=dataf,
                pred_col=col,
                feature_cols=feature_cols,
                target_col=target_col,
                benchmark_cols=benchmark_cols,
            )
            val_stats = pd.concat([val_stats, col_stats], axis=0)
        return val_stats

    def evaluation_one_col(
        self,
        dataf: pd.DataFrame,
        feature_cols: list,
        pred_col: str,
        target_col: str,
        benchmark_cols: list = None,
    ):
        """
        Perform evaluation for one prediction column
        against given target and benchmark column(s).
        """
        assert (
            self.era_col in dataf.columns
        ), f"Era column '{self.era_col}' not found in DataFrame. Make sure to set the correct era_col."
        assert (
                pred_col in dataf.columns
            ), f"Prediction column '{pred_col}' not found in DataFrame. Make sure to set the correct pred_col."
        assert (
            target_col in dataf.columns
        ), f"Target column '{target_col}' not found in DataFrame. Make sure to set the correct target_col."
        if benchmark_cols:
            for col in benchmark_cols:
                assert (
                    col in dataf.columns
                ), f"Benchmark column '{col}' not found in DataFrame. Make sure to set the correct benchmark_cols."

        # Check that all values are between 0 and 1
        assert (
            dataf[pred_col].min().min() >= 0 and dataf[pred_col].max().max() <= 1
        ), "All predictions should be between 0 and 1 (inclusive)."
        assert (
            dataf[target_col].min() >= 0 and dataf[target_col].max() <= 1
        ), "All targets should be between 0 and 1 (inclusive)."
        if benchmark_cols is not None:
            for col in benchmark_cols:
                assert (
                    dataf[col].min() >= 0 and dataf[col].max() <= 1
                ), f"All predictions for '{col}' should be between 0 and 1 (inclusive)."

        if self.show_detailed_progress_bar:
            len_metrics_list = len(self.metrics_list)
            len_benchmark_cols = 0 if benchmark_cols is None else len(benchmark_cols)
            len_custom_functions = 0 if self.custom_functions is None else len(list(self.custom_functions.keys()))
            len_pbar = len_metrics_list + len_benchmark_cols + len_custom_functions
            pbar = tqdm(total=len_pbar, desc="Evaluation")

        col_stats = {}
        col_stats["target"] = target_col

        # Compute stats per era (only if needed)
        per_era_numerai_corrs = self.per_era_numerai_corrs(
            dataf=dataf, pred_col=pred_col, target_col=target_col
        )

        # check if mean, std, or sharpe are in metrics_list
        if "mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"mean_std_sharpe for evaluation")
                pbar.update(1)
            mean, std, sharpe = self.mean_std_sharpe(era_corrs=per_era_numerai_corrs)
            col_stats["mean"] = mean
            col_stats["std"] = std
            col_stats["sharpe"] = sharpe

        if "legacy_mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"legacy_mean_std_sharpe for evaluation")
                pbar.update(1)
            per_era_corrs = self.per_era_corrs(
                dataf=dataf, pred_col=pred_col, target_col=target_col
            )
            legacy_mean, legacy_std, legacy_sharpe = self.mean_std_sharpe(
                era_corrs=per_era_corrs
            )
            col_stats["legacy_mean"] = legacy_mean
            col_stats["legacy_std"] = legacy_std
            col_stats["legacy_sharpe"] = legacy_sharpe

        if "max_drawdown" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"max_drawdown for evaluation")
                pbar.update(1)
            col_stats["max_drawdown"] = self.max_drawdown(
                era_corrs=per_era_numerai_corrs
            )

        if "apy":
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"apy for evaluation")
                pbar.update(1)
            col_stats["apy"] = self.apy(era_corrs=per_era_numerai_corrs)

        if "calmar_ratio" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"calmar_ratio for evaluation")
                pbar.update(1)
            if not "max_drawdown" in self.metrics_list:
                col_stats["max_drawdown"] = self.max_drawdown(
                    era_corrs=per_era_numerai_corrs
                )
            if not "apy" in self.metrics_list:
                col_stats["apy"] = self.apy(era_corrs=per_era_numerai_corrs)
            col_stats["calmar_ratio"] = (
                np.nan
                if col_stats["max_drawdown"] == 0
                else col_stats["apy"] / -col_stats["max_drawdown"]
            )

        if "autocorrelation" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description(f"autocorrelation for evaluation")
                pbar.update(1)
            col_stats["autocorrelation"] = self.autocorr1(per_era_numerai_corrs)

        if "max_feature_exposure" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"max_feature_exposure for evaluation")
                pbar.update(1)
            col_stats["max_feature_exposure"] = self.max_feature_exposure(
                dataf=dataf, feature_cols=feature_cols, pred_col=pred_col
            )

        if "smart_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"smart_sharpe for evaluation")
                pbar.update(1)
            col_stats["smart_sharpe"] = self.smart_sharpe(
                era_corrs=per_era_numerai_corrs
            )

        if benchmark_cols is not None:
            for bench_col in benchmark_cols:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"Evaluation for benchmark column: '{bench_col}'")
                    pbar.update(1)

                per_era_bench_corrs = self.per_era_numerai_corrs(
                    dataf=dataf, pred_col=bench_col, target_col=target_col
                )

                if "mean_std_sharpe" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"mean_std_sharpe for benchmark column: '{bench_col}'")
                    bench_mean, bench_std, bench_sharpe = self.mean_std_sharpe(
                        era_corrs=per_era_bench_corrs
                    )
                    col_stats[f"mean_vs_{bench_col}"] = mean - bench_mean
                    col_stats[f"std_vs_{bench_col}"] = std - bench_std
                    col_stats[f"sharpe_vs_{bench_col}"] = sharpe - bench_sharpe

                if "mc_mean_std_sharpe" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"mc_mean_std_sharpe for benchmark column: '{bench_col}'")
                    mc_scores = self.contributive_correlation(
                        dataf=dataf,
                        pred_col=pred_col,
                        target_col=target_col,
                        other_col=bench_col,
                    )
                    col_stats[f"mc_mean_{bench_col}"] = np.nanmean(mc_scores)
                    col_stats[f"mc_std_{bench_col}"] = np.nanstd(mc_scores)
                    col_stats[f"mc_sharpe_{bench_col}"] = (
                        np.nan
                        if col_stats[f"mc_std_{bench_col}"] == 0
                        else col_stats[f"mc_mean_{bench_col}"]
                        / col_stats[f"mc_std_{bench_col}"]
                    )

                if "corr_with" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"corr_with for benchmark column: '{bench_col}'")
                    col_stats[f"corr_with_{bench_col}"] = self.cross_correlation(
                        dataf=dataf, pred_col=pred_col, other_col=bench_col
                    )

                if "legacy_mc_mean_std_sharpe" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"legacy_mc_mean_std_sharpe for benchmark column: '{bench_col}'")
                    legacy_mc_scores = self.legacy_contribution(
                        dataf=dataf,
                        pred_col=pred_col,
                        target_col=target_col,
                        other_col=bench_col,
                    )
                    col_stats[f"legacy_mc_mean_{bench_col}"] = np.nanmean(
                        legacy_mc_scores
                    )
                    col_stats[f"legacy_mc_std_{bench_col}"] = np.nanstd(
                        legacy_mc_scores
                    )
                    col_stats[f"legacy_mc_sharpe_{bench_col}"] = (
                        np.nan
                        if col_stats[f"legacy_mc_std_{bench_col}"] == 0
                        else col_stats[f"legacy_mc_mean_{bench_col}"]
                        / col_stats[f"legacy_mc_std_{bench_col}"]
                    )

                if "ex_diss" in self.metrics_list or "ex_diss_pearson" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"ex_diss_pearson for benchmark column: '{bench_col}'")
                    col_stats[
                        f"exposure_dissimilarity_pearson_{bench_col}"
                    ] = self.exposure_dissimilarity(
                        dataf=dataf, pred_col=pred_col, other_col=bench_col,
                        corr_method="pearson"
                    )
                if "ex_diss_spearman" in self.metrics_list:
                    if self.show_detailed_progress_bar:
                        pbar.set_description_str(f"ex_diss_spearman for benchmark column: '{bench_col}'")
                    col_stats[
                        f"exposure_dissimilarity_spearman_{bench_col}"
                    ] = self.exposure_dissimilarity(
                        dataf=dataf, pred_col=pred_col, other_col=bench_col,
                        corr_method="spearman"
                    )

        # Compute intensive stats
        if "fn_mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"fn_mean_std_sharpe for evaluation")
                pbar.update(1)
            fn_mean, fn_std, fn_sharpe = self.feature_neutral_mean_std_sharpe(
                dataf=dataf,
                pred_col=pred_col,
                target_col=target_col,
                feature_names=feature_cols,
            )
            col_stats["feature_neutral_mean"] = fn_mean
            col_stats["feature_neutral_std"] = fn_std
            col_stats["feature_neutral_sharpe"] = fn_sharpe

        if "tb200_mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"tb200_mean_std_sharpe for evaluation")
                pbar.update(1)
            tb200_mean, tb200_std, tb200_sharpe = self.tbx_mean_std_sharpe(
                dataf=dataf, pred_col=pred_col, target_col=target_col, tb=200
            )
            col_stats["tb200_mean"] = tb200_mean
            col_stats["tb200_std"] = tb200_std
            col_stats["tb200_sharpe"] = tb200_sharpe

        if "tb500_mean_std_sharpe" in self.metrics_list:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"tb500_mean_std_sharpe for evaluation")
                pbar.update(1)
            tb500_mean, tb500_std, tb500_sharpe = self.tbx_mean_std_sharpe(
                dataf=dataf, pred_col=pred_col, target_col=target_col, tb=500
            )
            col_stats["tb500_mean"] = tb500_mean
            col_stats["tb500_std"] = tb500_std
            col_stats["tb500_sharpe"] = tb500_sharpe

        # Custom functions
        if self.custom_functions is not None:
            local_vars = locals()
            for func_name, func_info in self.custom_functions.items():
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"custom function: '{func_name}' for evaluation")
                    pbar.update(1)
                func = func_info['func']
                args = func_info['args']
                local_args = func_info['local_args']
                resolved_args = {}
                for k, v in args.items():
                    # Resolve variables defined as local args
                    if isinstance(v, str) and v in local_args:
                        if v not in local_vars:
                            raise ValueError(f"Variable '{v}' was defined in 'local_args', but was not found in local variables. Make sure to set the correct local_args.")
                        else:
                            resolved_args[k] = local_vars[v]
                    else:
                        resolved_args[k] = v
                col_stats[func_name] = func(**resolved_args)

        col_stats_df = pd.DataFrame(col_stats, index=[pred_col])
        if self.show_detailed_progress_bar:
            pbar.update(1)
            pbar.close()
        return col_stats_df

    def per_era_corrs(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str
    ) -> pd.Series:
        """Correlation between prediction and target for each era."""
        return dataf.groupby(self.era_col).apply(
            lambda d: self._normalize_uniform(d[pred_col].fillna(0.5)).corr(
                d[target_col]
            )
        )

    def per_era_numerai_corrs(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str
    ) -> pd.Series:
        """Numerai Corr between prediction and target for each era."""
        return dataf.groupby(self.era_col).apply(
            lambda d: self.numerai_corr(d.fillna(0.5), pred_col, target_col)
        )

    def mean_std_sharpe(
        self, era_corrs: pd.Series
    ) -> Tuple[np.float64, np.float64, np.float64]:
        """
        Average, standard deviation and Sharpe ratio for
        correlations per era.
        """
        mean = pd.Series(era_corrs.mean()).item()
        std = pd.Series(era_corrs.std(ddof=0)).item()
        sharpe = np.nan if std == 0 else mean / std
        return mean, std, sharpe

    def numerai_corr(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str
    ) -> np.float64:
        """
        Computes 'Numerai Corr' aka 'Corrv2'.
        More info: https://forum.numer.ai/t/target-cyrus-new-primary-target/6303

        Assumes original target col as input (i.e. in [0, 1] range).
        """
        # Rank and gaussianize predictions
        ranked_preds = self._normalize_uniform(
            dataf[pred_col].fillna(0.5), method="average"
        )
        gauss_ranked_preds = stats.norm.ppf(ranked_preds)
        # Center target from [0...1] to [-0.5...0.5] range
        targets = dataf[target_col]
        centered_target = targets - targets.mean()
        # Accentuate tails of predictions and targets
        preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5
        target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5
        # Pearson correlation
        corr, _ = stats.pearsonr(preds_p15, target_p15)
        return corr

    @staticmethod
    def max_drawdown(era_corrs: pd.Series) -> np.float64:
        """Maximum drawdown per era."""
        # Arbitrarily large window
        rolling_max = (
            (era_corrs + 1).cumprod().rolling(window=9000, min_periods=1).max()
        )
        daily_value = (era_corrs + 1).cumprod()
        max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
        return max_drawdown

    @staticmethod
    def apy(era_corrs: pd.Series, stake_compounding_lag: int = 4) -> np.float64:
        """
        Annual percentage yield.
        :param era_corrs: Correlation scores by era
        :param stake_compounding_lag: Compounding lag for Numerai rounds (4 for Numerai Classic)
        """
        payout_scores = era_corrs.clip(-0.25, 0.25)
        payout_product = (payout_scores + 1).prod()
        return (
            payout_product
            ** (
                # 52 weeks of compounding minus n for stake compounding lag
                (52 - stake_compounding_lag)
                / len(payout_scores)
            )
            - 1
        ) * 100

    def cross_correlation(self, dataf: pd.DataFrame, pred_col: str, other_col: str):
        """
        Corrv2 correlation with other predictions (like another model, example predictions or meta model prediction).
        :param dataf: DataFrame containing both pred_col and other_col.
        :param pred_col: Main Prediction.
        :param other_col: Other prediction column to calculate correlation with pred_col.

        :return: Correlation between Corrv2's of pred_col and other_col.
        """
        return self.per_era_numerai_corrs(
            dataf=dataf,
            pred_col=pred_col,
            target_col=other_col,
        ).mean()

    def max_feature_exposure(
        self, dataf: pd.DataFrame, feature_cols: List[str], pred_col: str
    ) -> np.float64:
        """Maximum exposure over all features."""
        max_per_era = dataf.groupby(self.era_col).apply(
            lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max()
        )
        max_feature_exposure = max_per_era.mean(skipna=True)
        return max_feature_exposure

    def feature_neutral_mean_std_sharpe(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str, feature_names: list
    ) -> Tuple[np.float64, np.float64, np.float64]:
        """
        Feature neutralized mean performance.
        More info: https://docs.numer.ai/tournament/feature-neutral-correlation
        """
        fn = FeatureNeutralizer(pred_name=pred_col, proportion=1.0)
        fn.set_predict_request(features=True, era_series=True)
        neutralized_preds = fn.predict(
            dataf[pred_col], features=dataf[feature_names], era_series=dataf[self.era_col]
        )
        # Construct new DataFrame with era col, target col and preds
        neutralized_dataf = pd.DataFrame(columns=[self.era_col, target_col, pred_col])
        neutralized_dataf[self.era_col] = dataf[self.era_col]
        neutralized_dataf[target_col] = dataf[target_col]
        neutralized_dataf[pred_col] = neutralized_preds

        neutral_corrs = self.per_era_numerai_corrs(
            dataf=neutralized_dataf,
            pred_col=pred_col,
            target_col=target_col,
        )
        mean, std, sharpe = self.mean_std_sharpe(era_corrs=neutral_corrs)
        return mean, std, sharpe

    def tbx_mean_std_sharpe(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str, tb: int = 200
    ) -> Tuple[np.float64, np.float64, np.float64]:
        """
        Calculate Mean, Standard deviation and Sharpe ratio
        when we focus on the x top and x bottom predictions.
        :param tb: How many of top and bottom predictions to focus on.
        TB200 and TB500 are the most common situations.
        """
        tb_val_corrs = self._score_by_date(
            dataf=dataf, columns=[pred_col], target=target_col, tb=tb
        )
        return self.mean_std_sharpe(era_corrs=tb_val_corrs)

    def exposure_dissimilarity(
        self, dataf: pd.DataFrame, pred_col: str, other_col: str, corr_method: str = "pearson"
    ) -> np.float32:
        """
        Model pattern of feature exposure relative to another column.
        See TC details forum post: https://forum.numer.ai/t/true-contribution-details/5128/4
        :param dataf: DataFrame containing both pred_col and other_col.
        :param pred_col: Main Prediction.
        :param other_col: Other prediction column to calculate exposure dissimilarity against.
        :param corr_method: Correlation method to use for calculating feature exposures.
        corr_method should be one of ['pearson', 'kendall', 'spearman']. Default: 'pearson'.
        """
        assert corr_method in ["pearson", "kendall", "spearman"], f"corr_method should be one of ['pearson', 'kendall', 'spearman']. Got: '{corr_method}'"
        feature_cols = [col for col in dataf.columns if col.startswith("feature")]
        U = dataf[feature_cols].corrwith(dataf[pred_col], method=corr_method).values
        E = dataf[feature_cols].corrwith(dataf[other_col], method=corr_method).values

        denominator = np.dot(E, E)
        if denominator == 0:
            exp_dis = 0
        else:
            exp_dis = 1 - np.dot(U, E) / denominator
        return exp_dis

    @staticmethod
    def _neutralize_series(
        series: pd.Series, by: pd.Series, proportion=1.0
    ) -> pd.Series:
        scores = series.values.reshape(-1, 1)
        exposures = by.values.reshape(-1, 1)

        # This line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures
        exposures = np.hstack(
            (exposures, np.array([np.nanmean(series)] * len(exposures)).reshape(-1, 1))
        )

        correction = proportion * (
            exposures.dot(np.linalg.lstsq(exposures, scores, rcond=None)[0])
        )
        corrected_scores = scores - correction
        neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
        return neutralized

    @staticmethod
    def _orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
        """Orthogonalizes v with respect to u by projecting v onto u,
        then subtracting that projection from v.

        This will reach the same result as the neutralize function when v and u
        are single column vectors, but this is much faster.

        Arguments:
            v: np.ndarray - the vector to orthogonalize
            u: np.ndarray - the vector to orthogonalize v against

        Returns:
            np.ndarray - the orthogonalized vector v
        """
        # Calculate the dot product of u and v
        dot_product = u.T @ v

        # Calculate the projection of v onto u
        projection = (dot_product / (u.T @ u)) * u

        # Subtract the projection from v
        return v - projection

    def _score_by_date(
        self, dataf: pd.DataFrame, columns: list, target: str, tb: int = None
    ):
        """
        Get era correlation based on given TB (x top and bottom predictions).
        :param tb: How many of top and bottom predictions to focus on.
        TB200 is the most common situation.
        """
        unique_eras = dataf[self.era_col].unique()
        computed = []
        for u in unique_eras:
            df_era = dataf[dataf[self.era_col] == u]
            era_pred = np.float64(df_era[columns].values.T)
            era_target = np.float64(df_era[target].values.T)

            if tb is None:
                ccs = np.corrcoef(era_target, era_pred)[0, 1:]
            else:
                tbidx = np.argsort(era_pred, axis=1)
                tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)
                ccs = [
                    np.corrcoef(era_target[idx], pred[idx])[0, 1]
                    for idx, pred in zip(tbidx, era_pred)
                ]
                ccs = np.array(ccs)
            computed.append(ccs)
        return pd.DataFrame(
            np.array(computed), columns=columns, index=dataf[self.era_col].unique()
        )

    @staticmethod
    def _normalize_uniform(df: pd.DataFrame, method: str = "first") -> pd.Series:
        """
        Normalize predictions uniformly using ranks.
        NOTE: Make sure the range of predictions is [0, 1] (inclusive).
        """
        x = (df.rank(method=method) - 0.5) / len(
            df
        )  # TODO: Evaluate if subtracting df.mean() is better
        return pd.Series(x, index=df.index)

    def get_feature_exposures_pearson(
        self,
        dataf: pd.DataFrame,
        pred_col: str,
        feature_list: List[str],
        cpu_cores: int = -1,
    ) -> pd.DataFrame:
        """
        Calculate feature exposures for each era using Pearson correlation.

        :param dataf: DataFrame containing predictions, features, and eras.
        :param pred_col: Prediction column to calculate feature exposures for.
        :param feature_list: List of feature columns in X.
        :param cpu_cores: Number of CPU cores to use for parallelization.
        :return: DataFrame with Pearson feature exposures by era for each feature.
        """

        def calculate_era_pearson_exposure(
            era, group, feature_list, pred_col_normalized
        ):
            data_matrix = group[feature_list + [pred_col_normalized]].values
            correlations = np.corrcoef(data_matrix, rowvar=False)

            # Get the correlations of all features with the predictions (which is the last column)
            feature_correlations = correlations[:-1, -1]
            return era, feature_correlations

        normalized_ranks = (dataf[[pred_col]].rank(method="first") - 0.5) / len(dataf)
        dataf[f"{pred_col}_normalized"] = stats.norm.ppf(normalized_ranks)
        feature_exposure_data = pd.DataFrame(
            index=dataf["era"].unique(), columns=feature_list
        )

        grouped_data = list(dataf.groupby("era"))

        results = Parallel(n_jobs=cpu_cores)(
            delayed(calculate_era_pearson_exposure)(
                era, group, feature_list, f"{pred_col}_normalized"
            )
            for era, group in grouped_data
        )

        for era, feature_correlations in results:
            feature_exposure_data.loc[era, :] = feature_correlations
        return feature_exposure_data

    def get_feature_exposures_corrv2(
        self,
        dataf: pd.DataFrame,
        pred_col: str,
        feature_list: List[str],
        cpu_cores: int = -1,
    ) -> pd.DataFrame:
        """
        Calculate feature exposures for each era using 'Numerai Corr'.
        Results will be similar to get_feature_exposures() but more accurate.
        This method will take longer to compute.

        :param dataf: DataFrame containing predictions, features, and eras.
        :param pred_col: Prediction column to calculate feature exposures for.
        :param feature_list: List of feature columns in X.
        :param cpu_cores: Number of CPU cores to use for parallelization.
        Default: -1 (all cores).
        :return: DataFrame with Corrv2 feature exposures by era for each feature.
        """

        def calculate_era_feature_exposure(era, group, pred_col, feature_list):
            exposures = {}
            for feature in feature_list:
                corr = self.numerai_corr(
                    group, pred_col=f"{pred_col}_normalized", target_col=feature
                )
                exposures[feature] = corr
            return era, exposures

        normalized_ranks = (dataf[[pred_col]].rank(method="first") - 0.5) / len(dataf)
        dataf[f"{pred_col}_normalized"] = stats.norm.ppf(normalized_ranks)
        feature_exposure_data = pd.DataFrame(
            index=dataf["era"].unique(), columns=feature_list
        )

        grouped_data = list(dataf.groupby("era"))

        results = Parallel(n_jobs=cpu_cores)(
            delayed(calculate_era_feature_exposure)(era, group, pred_col, feature_list)
            for era, group in grouped_data
        )
        for era, exposures in results:
            feature_exposure_data.loc[era, :] = exposures
        return feature_exposure_data

    def smart_sharpe(self, era_corrs: pd.Series) -> np.float64:
        """
        Sharpe adjusted for autocorrelation.
        :param era_corrs: Correlation scores by era
        """
        return np.nanmean(era_corrs) / (
            np.nanstd(era_corrs, ddof=1) * self.autocorr_penalty(era_corrs)
        )

    def autocorr_penalty(self, era_corrs: pd.Series) -> np.float64:
        """
        Adjusting factor for autocorrelation. Used in Smart Sharpe.
        :param era_corrs: Correlation scores by era.
        """
        n = len(era_corrs)
        # 1st order autocorrelation
        p = self.autocorr1(era_corrs)
        return np.sqrt(1 + 2 * np.sum([((n - i) / n) * p**i for i in range(1, n)]))

    def autocorr1(self, era_corrs: pd.Series) -> np.float64:
        """
        1st order autocorrelation.
        :param era_corrs: Correlation scores by era.
        """
        return np.corrcoef(era_corrs[:-1], era_corrs[1:])[0, 1]

    def legacy_contribution(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str, other_col: str
    ):
        """
        Legacy contribution mean, standard deviation and Sharpe ratio.
        More info: https://forum.numer.ai/t/mmc2-announcement/93

        :param dataf: DataFrame containing era_col, pred_col, target_col and other_col.
        :param pred_col: Prediction column to calculate MMC for.
        :param target_col: Target column to calculate MMC against.
        :param other_col: Meta model column containing predictions to neutralize against.

        :return: List of legacy contribution scores by era.
        """
        legacy_mc_scores = []
        # Standard deviation of a uniform distribution
        COVARIANCE_FACTOR = 0.29**2
        # Calculate MMC for each era
        for _, x in dataf.groupby(self.era_col):
            series = self._neutralize_series(
                self._normalize_uniform(x[pred_col]), (x[other_col])
            )
            legacy_mc_scores.append(
                np.cov(series, x[target_col])[0, 1] / COVARIANCE_FACTOR
            )

        return legacy_mc_scores

    def contributive_correlation(
        self, dataf: pd.DataFrame, pred_col: str, target_col: str, other_col: str
    ) -> np.array:
        """Calculate the contributive correlation of the given predictions
        wrt the given meta model.
        see: https://docs.numer.ai/numerai-tournament/scoring/meta-model-contribution-mmc-and-bmc

        Uses Numerai's official scoring function for contribution under the hood.
        See: https://github.com/numerai/numerai-tools/blob/master/numerai_tools/scoring.py

        Calculate contributive correlation by:
        1. tie-kept ranking each prediction and the meta model
        2. gaussianizing each prediction and the meta model
        3. orthogonalizing each prediction wrt the meta model
        3.5. scaling the targets to buckets [-2, -1, 0, 1, 2]
        4. dot product the orthogonalized predictions and the targets,
           then normalize by the length of the target (equivalent to covariance)

        :param dataf: DataFrame containing era_col, pred_col, target_col and other_col.
        :param pred_col: Prediction column to calculate MMC for.
        :param target_col: Target column to calculate MMC against.
        Make sure the range of targets is [0, 1] (inclusive). 
        If the function is called from full_evaluation, this is guaranteed because of the checks.
        :param other_col: Meta model column containing predictions to neutralize against.

        :return: A 1D NumPy array of contributive correlations by era.
        """
        mc_scores = []
        for _, x in dataf.groupby(self.era_col):
            mc = correlation_contribution(x[[pred_col]], 
                                          x[other_col], 
                                          x[target_col])
            mc_scores.append(mc)
        return np.array(mc_scores).ravel()

    def check_custom_functions(self):
        if not isinstance(self.custom_functions, dict):
            raise ValueError("custom_functions must be a dictionary")

        for func_name, func_info in self.custom_functions.items():
            if not isinstance(func_info, dict) or 'func' not in func_info or 'args' not in func_info:
                raise ValueError(f"Function {func_name} must have a 'func' and 'args' key")

            if not callable(func_info['func']):
                raise ValueError(f"The 'func' value for '{func_name}' in custom_functions must be a callable function.")

            if not isinstance(func_info['args'], dict):
                raise ValueError(f"'args' for '{func_name}' in custom_functions must be a dictionary")

            if "local_args" in func_info:
                if not isinstance(func_info['local_args'], list):
                    raise ValueError(f"The 'local_args' key for {func_name} in custom_functionsmust be a list")
                for local_arg in func_info['local_args']:
                    if not isinstance(local_arg, str):
                        raise ValueError(f"Local arg '{local_arg}' for '{func_name}' in custom_functions must be string.")
                    if local_arg not in list(func_info['args'].keys()):
                        raise ValueError(f"Local arg '{local_arg}' for '{func_name}' in custom_functions was not found in 'args'")

    def plot_correlations(
        self,
        dataf: pd.DataFrame,
        pred_cols: List[str],
        corr_cols: list = None,
        target_col: str = "target",
        roll_mean: int = 20,
    ):
        """
        Plot per era correlations over time.
        :param dataf: DataFrame that contains at least all pred_cols, target_col and corr_cols.
        :param pred_cols: List of prediction columns to calculate per era correlations for and plot.
        :param corr_cols: Per era correlations already prepared to include in the plot.
        This is optional for if you already have per era correlations prepared in your input dataf.
        :param target_col: Target column name to compute per era correlations against.
        :param roll_mean: How many eras should be averaged to compute a rolling score.
        """
        validation_by_eras = pd.DataFrame()
        # Compute per era correlation for each prediction column.
        for pred_col in pred_cols:
            per_era_corrs = self.per_era_numerai_corrs(
                dataf, pred_col=pred_col, target_col=target_col
            )
            validation_by_eras.loc[:, pred_col] = per_era_corrs

        # Add prepared per era correlation if any.
        if corr_cols is not None:
            for corr_col in corr_cols:
                validation_by_eras.loc[:, corr_col] = dataf[corr_col]

        validation_by_eras.rolling(roll_mean).mean().plot(
            kind="line",
            marker="o",
            ms=4,
            title=f"Rolling Per Era Correlation Mean (rolling window size: {roll_mean})",
            figsize=(15, 5),
        )
        plt.legend(
            loc="upper center",
            bbox_to_anchor=(0.5, -0.05),
            fancybox=True,
            shadow=True,
            ncol=1,
        )
        plt.axhline(y=0.0, color="r", linestyle="--")
        plt.show()

        validation_by_eras.cumsum().plot(
            title="Cumulative Sum of Era Correlations", figsize=(15, 5)
        )
        plt.legend(
            loc="upper center",
            bbox_to_anchor=(0.5, -0.05),
            fancybox=True,
            shadow=True,
            ncol=1,
        )
        plt.axhline(y=0.0, color="r", linestyle="--")
        plt.show()
        return

    @staticmethod
    def plot_correlation_heatmap(dataf: pd.DataFrame, pred_cols: List[str]):
        corr_matrix = dataf[pred_cols].corr().to_numpy()

        plt.figure(figsize=(20, 20))

        # Create heatmap
        plt.imshow(corr_matrix, cmap="coolwarm", interpolation="none")
        plt.colorbar()

        # Add ticks and labels
        ticks = np.arange(0, len(pred_cols), 1)
        plt.xticks(ticks, pred_cols, rotation=90, fontsize=8)
        plt.yticks(ticks, pred_cols, fontsize=8)

        plt.show()
        return

apy(era_corrs, stake_compounding_lag=4) staticmethod

Annual percentage yield. :param era_corrs: Correlation scores by era :param stake_compounding_lag: Compounding lag for Numerai rounds (4 for Numerai Classic)

Source code in numerblox/evaluation.py
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
@staticmethod
def apy(era_corrs: pd.Series, stake_compounding_lag: int = 4) -> np.float64:
    """
    Annual percentage yield.
    :param era_corrs: Correlation scores by era
    :param stake_compounding_lag: Compounding lag for Numerai rounds (4 for Numerai Classic)
    """
    payout_scores = era_corrs.clip(-0.25, 0.25)
    payout_product = (payout_scores + 1).prod()
    return (
        payout_product
        ** (
            # 52 weeks of compounding minus n for stake compounding lag
            (52 - stake_compounding_lag)
            / len(payout_scores)
        )
        - 1
    ) * 100
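
As a sanity check of the compounding above, the same formula can be reproduced standalone on a toy series of era correlations (the numbers below are made up purely for illustration):

import pandas as pd

# Toy per-era correlation scores, purely illustrative.
era_corrs = pd.Series([0.02, -0.01, 0.03, 0.015, 0.005])

# Clip payouts, compound them, then annualize over 52 - 4 rounds,
# mirroring the apy() implementation above.
payout_scores = era_corrs.clip(-0.25, 0.25)
apy_pct = ((payout_scores + 1).prod() ** ((52 - 4) / len(payout_scores)) - 1) * 100
print(round(apy_pct, 2))  # annual percentage yield in percent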

autocorr1(era_corrs)

1st order autocorrelation. :param era_corrs: Correlation scores by era.

Source code in numerblox/evaluation.py
768
769
770
771
772
773
def autocorr1(self, era_corrs: pd.Series) -> np.float64:
    """
    1st order autocorrelation.
    :param era_corrs: Correlation scores by era.
    """
    return np.corrcoef(era_corrs[:-1], era_corrs[1:])[0, 1]

autocorr_penalty(era_corrs)

Adjusting factor for autocorrelation. Used in Smart Sharpe. :param era_corrs: Correlation scores by era.

Source code in numerblox/evaluation.py
758
759
760
761
762
763
764
765
766
def autocorr_penalty(self, era_corrs: pd.Series) -> np.float64:
    """
    Adjusting factor for autocorrelation. Used in Smart Sharpe.
    :param era_corrs: Correlation scores by era.
    """
    n = len(era_corrs)
    # 1st order autocorrelation
    p = self.autocorr1(era_corrs)
    return np.sqrt(1 + 2 * np.sum([((n - i) / n) * p**i for i in range(1, n)]))

contributive_correlation(dataf, pred_col, target_col, other_col)

Calculate the contributive correlation of the given predictions wrt the given meta model. see: https://docs.numer.ai/numerai-tournament/scoring/meta-model-contribution-mmc-and-bmc

Uses Numerai's official scoring function for contribution under the hood. See: https://github.com/numerai/numerai-tools/blob/master/numerai_tools/scoring.py

Calculate contributive correlation by: 1. tie-kept ranking each prediction and the meta model 2. gaussianizing each prediction and the meta model 3. orthogonalizing each prediction wrt the meta model 3.5. scaling the targets to buckets [-2, -1, 0, 1, 2] 4. dot product the orthogonalized predictions and the targets then normalize by the length of the target (equivalent to covariance)

:param dataf: DataFrame containing era_col, pred_col, target_col and other_col. :param pred_col: Prediction column to calculate MMC for. :param target_col: Target column to calculate MMC against. Make sure the range of targets is [0, 1] (inclusive). If the function is called from full_evaluation, this is guaranteed because of the checks. :param other_col: Meta model column containing predictions to neutralize against.

:return: A 1D NumPy array of contributive correlations by era.

Source code in numerblox/evaluation.py
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
def contributive_correlation(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str, other_col: str
) -> np.array:
    """Calculate the contributive correlation of the given predictions
    wrt the given meta model.
    see: https://docs.numer.ai/numerai-tournament/scoring/meta-model-contribution-mmc-and-bmc

    Uses Numerai's official scoring function for contribution under the hood.
    See: https://github.com/numerai/numerai-tools/blob/master/numerai_tools/scoring.py

    Calculate contributive correlation by:
    1. tie-kept ranking each prediction and the meta model
    2. gaussianizing each prediction and the meta model
    3. orthogonalizing each prediction wrt the meta model
    3.5. scaling the targets to buckets [-2, -1, 0, 1, 2]
    4. dot product the orthogonalized predictions and the targets,
       then normalize by the length of the target (equivalent to covariance)

    :param dataf: DataFrame containing era_col, pred_col, target_col and other_col.
    :param pred_col: Prediction column to calculate MMC for.
    :param target_col: Target column to calculate MMC against.
    Make sure the range of targets is [0, 1] (inclusive). 
    If the function is called from full_evaluation, this is guaranteed because of the checks.
    :param other_col: Meta model column containing predictions to neutralize against.

    :return: A 1D NumPy array of contributive correlations by era.
    """
    mc_scores = []
    for _, x in dataf.groupby(self.era_col):
        mc = correlation_contribution(x[[pred_col]], 
                                      x[other_col], 
                                      x[target_col])
        mc_scores.append(mc)
    return np.array(mc_scores).ravel()
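
A minimal usage sketch on a synthetic DataFrame (column names and data are made up for illustration; the method delegates to correlation_contribution from the numerai-tools package, so that dependency needs to be installed):

import numpy as np
import pandas as pd
from numerblox.evaluation import NumeraiClassicEvaluator

rng = np.random.default_rng(0)
n = 200
df = pd.DataFrame({
    "era": ["0001"] * (n // 2) + ["0002"] * (n // 2),
    "prediction": rng.uniform(size=n),
    "meta_model": rng.uniform(size=n),
    # Targets must stay in the [0, 1] range, as noted in the docstring above.
    "target": rng.choice([0.0, 0.25, 0.5, 0.75, 1.0], size=n),
})

evaluator = NumeraiClassicEvaluator(era_col="era")
mc_by_era = evaluator.contributive_correlation(
    dataf=df, pred_col="prediction", target_col="target", other_col="meta_model"
)
print(mc_by_era)  # one contribution score per era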

cross_correlation(dataf, pred_col, other_col)

Corrv2 correlation with other predictions (like another model, example predictions or meta model prediction). :param dataf: DataFrame containing both pred_col and other_col. :param pred_col: Main Prediction. :param other_col: Other prediction column to calculate correlation with pred_col.

:return: Correlation between Corrv2's of pred_col and other_col.

Source code in numerblox/evaluation.py
485
486
487
488
489
490
491
492
493
494
495
496
497
498
def cross_correlation(self, dataf: pd.DataFrame, pred_col: str, other_col: str):
    """
    Corrv2 correlation with other predictions (like another model, example predictions or meta model prediction).
    :param dataf: DataFrame containing both pred_col and other_col.
    :param pred_col: Main Prediction.
    :param other_col: Other prediction column to calculate correlation with pred_col.

    :return: Correlation between Corrv2's of pred_col and other_col.
    """
    return self.per_era_numerai_corrs(
        dataf=dataf,
        pred_col=pred_col,
        target_col=other_col,
    ).mean()

evaluation_one_col(dataf, feature_cols, pred_col, target_col, benchmark_cols=None)

Perform evaluation for one prediction column against given target and benchmark column(s).

Source code in numerblox/evaluation.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
def evaluation_one_col(
    self,
    dataf: pd.DataFrame,
    feature_cols: list,
    pred_col: str,
    target_col: str,
    benchmark_cols: list = None,
):
    """
    Perform evaluation for one prediction column
    against given target and benchmark column(s).
    """
    assert (
        self.era_col in dataf.columns
    ), f"Era column '{self.era_col}' not found in DataFrame. Make sure to set the correct era_col."
    assert (
            pred_col in dataf.columns
        ), f"Prediction column '{pred_col}' not found in DataFrame. Make sure to set the correct pred_col."
    assert (
        target_col in dataf.columns
    ), f"Target column '{target_col}' not found in DataFrame. Make sure to set the correct target_col."
    if benchmark_cols:
        for col in benchmark_cols:
            assert (
                col in dataf.columns
            ), f"Benchmark column '{col}' not found in DataFrame. Make sure to set the correct benchmark_cols."

    # Check that all values are between 0 and 1
    assert (
        dataf[pred_col].min() >= 0 and dataf[pred_col].max() <= 1
    ), "All predictions should be between 0 and 1 (inclusive)."
    assert (
        dataf[target_col].min() >= 0 and dataf[target_col].max() <= 1
    ), "All targets should be between 0 and 1 (inclusive)."
    if benchmark_cols is not None:
        for col in benchmark_cols:
            assert (
                dataf[col].min() >= 0 and dataf[col].max() <= 1
            ), f"All predictions for '{col}' should be between 0 and 1 (inclusive)."

    if self.show_detailed_progress_bar:
        len_metrics_list = len(self.metrics_list)
        len_benchmark_cols = 0 if benchmark_cols is None else len(benchmark_cols)
        len_custom_functions = 0 if self.custom_functions is None else len(list(self.custom_functions.keys()))
        len_pbar = len_metrics_list + len_benchmark_cols + len_custom_functions
        pbar = tqdm(total=len_pbar, desc="Evaluation")

    col_stats = {}
    col_stats["target"] = target_col

    # Compute stats per era (only if needed)
    per_era_numerai_corrs = self.per_era_numerai_corrs(
        dataf=dataf, pred_col=pred_col, target_col=target_col
    )

    # check if mean, std, or sharpe are in metrics_list
    if "mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"mean_std_sharpe for evaluation")
            pbar.update(1)
        mean, std, sharpe = self.mean_std_sharpe(era_corrs=per_era_numerai_corrs)
        col_stats["mean"] = mean
        col_stats["std"] = std
        col_stats["sharpe"] = sharpe

    if "legacy_mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"legacy_mean_std_sharpe for evaluation")
            pbar.update(1)
        per_era_corrs = self.per_era_corrs(
            dataf=dataf, pred_col=pred_col, target_col=target_col
        )
        legacy_mean, legacy_std, legacy_sharpe = self.mean_std_sharpe(
            era_corrs=per_era_corrs
        )
        col_stats["legacy_mean"] = legacy_mean
        col_stats["legacy_std"] = legacy_std
        col_stats["legacy_sharpe"] = legacy_sharpe

    if "max_drawdown" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"max_drawdown for evaluation")
            pbar.update(1)
        col_stats["max_drawdown"] = self.max_drawdown(
            era_corrs=per_era_numerai_corrs
        )

    if "apy":
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"apy for evaluation")
            pbar.update(1)
        col_stats["apy"] = self.apy(era_corrs=per_era_numerai_corrs)

    if "calmar_ratio" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"calmar_ratio for evaluation")
            pbar.update(1)
        if not "max_drawdown" in self.metrics_list:
            col_stats["max_drawdown"] = self.max_drawdown(
                era_corrs=per_era_numerai_corrs
            )
        if not "apy" in self.metrics_list:
            col_stats["apy"] = self.apy(era_corrs=per_era_numerai_corrs)
        col_stats["calmar_ratio"] = (
            np.nan
            if col_stats["max_drawdown"] == 0
            else col_stats["apy"] / -col_stats["max_drawdown"]
        )

    if "autocorrelation" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description(f"autocorrelation for evaluation")
            pbar.update(1)
        col_stats["autocorrelation"] = self.autocorr1(per_era_numerai_corrs)

    if "max_feature_exposure" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"max_feature_exposure for evaluation")
            pbar.update(1)
        col_stats["max_feature_exposure"] = self.max_feature_exposure(
            dataf=dataf, feature_cols=feature_cols, pred_col=pred_col
        )

    if "smart_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"smart_sharpe for evaluation")
            pbar.update(1)
        col_stats["smart_sharpe"] = self.smart_sharpe(
            era_corrs=per_era_numerai_corrs
        )

    if benchmark_cols is not None:
        for bench_col in benchmark_cols:
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"Evaluation for benchmark column: '{bench_col}'")
                pbar.update(1)

            per_era_bench_corrs = self.per_era_numerai_corrs(
                dataf=dataf, pred_col=bench_col, target_col=target_col
            )

            if "mean_std_sharpe" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"mean_std_sharpe for benchmark column: '{bench_col}'")
                bench_mean, bench_std, bench_sharpe = self.mean_std_sharpe(
                    era_corrs=per_era_bench_corrs
                )
                col_stats[f"mean_vs_{bench_col}"] = mean - bench_mean
                col_stats[f"std_vs_{bench_col}"] = std - bench_std
                col_stats[f"sharpe_vs_{bench_col}"] = sharpe - bench_sharpe

            if "mc_mean_std_sharpe" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"mc_mean_std_sharpe for benchmark column: '{bench_col}'")
                mc_scores = self.contributive_correlation(
                    dataf=dataf,
                    pred_col=pred_col,
                    target_col=target_col,
                    other_col=bench_col,
                )
                col_stats[f"mc_mean_{bench_col}"] = np.nanmean(mc_scores)
                col_stats[f"mc_std_{bench_col}"] = np.nanstd(mc_scores)
                col_stats[f"mc_sharpe_{bench_col}"] = (
                    np.nan
                    if col_stats[f"mc_std_{bench_col}"] == 0
                    else col_stats[f"mc_mean_{bench_col}"]
                    / col_stats[f"mc_std_{bench_col}"]
                )

            if "corr_with" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"corr_with for benchmark column: '{bench_col}'")
                col_stats[f"corr_with_{bench_col}"] = self.cross_correlation(
                    dataf=dataf, pred_col=pred_col, other_col=bench_col
                )

            if "legacy_mc_mean_std_sharpe" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"legacy_mc_mean_std_sharpe for benchmark column: '{bench_col}'")
                legacy_mc_scores = self.legacy_contribution(
                    dataf=dataf,
                    pred_col=pred_col,
                    target_col=target_col,
                    other_col=bench_col,
                )
                col_stats[f"legacy_mc_mean_{bench_col}"] = np.nanmean(
                    legacy_mc_scores
                )
                col_stats[f"legacy_mc_std_{bench_col}"] = np.nanstd(
                    legacy_mc_scores
                )
                col_stats[f"legacy_mc_sharpe_{bench_col}"] = (
                    np.nan
                    if col_stats[f"legacy_mc_std_{bench_col}"] == 0
                    else col_stats[f"legacy_mc_mean_{bench_col}"]
                    / col_stats[f"legacy_mc_std_{bench_col}"]
                )

            if "ex_diss" in self.metrics_list or "ex_diss_pearson" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"ex_diss_pearson for benchmark column: '{bench_col}'")
                col_stats[
                    f"exposure_dissimilarity_pearson_{bench_col}"
                ] = self.exposure_dissimilarity(
                    dataf=dataf, pred_col=pred_col, other_col=bench_col,
                    corr_method="pearson"
                )
            if "ex_diss_spearman" in self.metrics_list:
                if self.show_detailed_progress_bar:
                    pbar.set_description_str(f"ex_diss_spearman for benchmark column: '{bench_col}'")
                col_stats[
                    f"exposure_dissimilarity_spearman_{bench_col}"
                ] = self.exposure_dissimilarity(
                    dataf=dataf, pred_col=pred_col, other_col=bench_col,
                    corr_method="spearman"
                )

    # Compute intensive stats
    if "fn_mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"fn_mean_std_sharpe for evaluation")
            pbar.update(1)
        fn_mean, fn_std, fn_sharpe = self.feature_neutral_mean_std_sharpe(
            dataf=dataf,
            pred_col=pred_col,
            target_col=target_col,
            feature_names=feature_cols,
        )
        col_stats["feature_neutral_mean"] = fn_mean
        col_stats["feature_neutral_std"] = fn_std
        col_stats["feature_neutral_sharpe"] = fn_sharpe

    if "tb200_mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"tb200_mean_std_sharpe for evaluation")
            pbar.update(1)
        tb200_mean, tb200_std, tb200_sharpe = self.tbx_mean_std_sharpe(
            dataf=dataf, pred_col=pred_col, target_col=target_col, tb=200
        )
        col_stats["tb200_mean"] = tb200_mean
        col_stats["tb200_std"] = tb200_std
        col_stats["tb200_sharpe"] = tb200_sharpe

    if "tb500_mean_std_sharpe" in self.metrics_list:
        if self.show_detailed_progress_bar:
            pbar.set_description_str(f"tb500_mean_std_sharpe for evaluation")
            pbar.update(1)
        tb500_mean, tb500_std, tb500_sharpe = self.tbx_mean_std_sharpe(
            dataf=dataf, pred_col=pred_col, target_col=target_col, tb=500
        )
        col_stats["tb500_mean"] = tb500_mean
        col_stats["tb500_std"] = tb500_std
        col_stats["tb500_sharpe"] = tb500_sharpe

    # Custom functions
    if self.custom_functions is not None:
        local_vars = locals()
        for func_name, func_info in self.custom_functions.items():
            if self.show_detailed_progress_bar:
                pbar.set_description_str(f"custom function: '{func_name}' for evaluation")
                pbar.update(1)
            func = func_info['func']
            args = func_info['args']
            local_args = func_info.get('local_args', [])
            resolved_args = {}
            for k, v in args.items():
                # Resolve variables defined as local args
                if isinstance(v, str) and v in local_args:
                    if v not in local_vars:
                        raise ValueError(f"Variable '{v}' was defined in 'local_args', but was not found in local variables. Make sure to set the correct local_args.")
                    else:
                        resolved_args[k] = local_vars[v]
                else:
                    resolved_args[k] = v
            col_stats[func_name] = func(**resolved_args)

    col_stats_df = pd.DataFrame(col_stats, index=[pred_col])
    if self.show_detailed_progress_bar:
        pbar.update(1)
        pbar.close()
    return col_stats_df
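
The custom_functions resolution at the end of this method expects the structure enforced by check_custom_functions: each entry has a 'func' and an 'args' dict, and any argument whose string value names a local variable of evaluation_one_col (such as 'dataf', 'pred_col' or 'target_col') must also be listed under 'local_args'. A hypothetical sketch (the metric itself, which groups on a literal "era" column, is made up for illustration):

import pandas as pd

# Hypothetical custom metric: spread between the best and worst per-era correlation.
def corr_spread(dataf: pd.DataFrame, pred_col: str, target_col: str) -> float:
    per_era = dataf.groupby("era").apply(lambda d: d[pred_col].corr(d[target_col]))
    return float(per_era.max() - per_era.min())

custom_functions = {
    "corr_spread": {
        "func": corr_spread,
        # Values listed in 'local_args' are resolved from evaluation_one_col's
        # local variables at runtime instead of being passed through literally.
        "args": {"dataf": "dataf", "pred_col": "pred_col", "target_col": "target_col"},
        "local_args": ["dataf", "pred_col", "target_col"],
    }
}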

exposure_dissimilarity(dataf, pred_col, other_col, corr_method='pearson')

Model pattern of feature exposure to another column. See TC details forum post: https://forum.numer.ai/t/true-contribution-details/5128/4 :param dataf: DataFrame containing both pred_col and other_col. :param pred_col: Main Prediction. :param other_col: Other prediction column to calculate exposure dissimilarity against. :param corr_method: Correlation method to use for calculating feature exposures. corr_method should be one of ['pearson', 'kendall', 'spearman']. Default: 'pearson'.

Source code in numerblox/evaluation.py
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
def exposure_dissimilarity(
    self, dataf: pd.DataFrame, pred_col: str, other_col: str, corr_method: str = "pearson"
) -> np.float32:
    """
    Model pattern of feature exposure to another column.
    See TC details forum post: https://forum.numer.ai/t/true-contribution-details/5128/4
    :param dataf: DataFrame containing both pred_col and other_col.
    :param pred_col: Main Prediction.
    :param other_col: Other prediction column to calculate exposure dissimilarity against.
    :param corr_method: Correlation method to use for calculating feature exposures.
    corr_method should be one of ['pearson', 'kendall', 'spearman']. Default: 'pearson'.
    """
    assert corr_method in ["pearson", "kendall", "spearman"], f"corr_method should be one of ['pearson', 'kendall', 'spearman']. Got: '{corr_method}'"
    feature_cols = [col for col in dataf.columns if col.startswith("feature")]
    U = dataf[feature_cols].corrwith(dataf[pred_col], method=corr_method).values
    E = dataf[feature_cols].corrwith(dataf[other_col], method=corr_method).values

    denominator = np.dot(E, E)
    if denominator == 0:
        exp_dis = 0
    else:
        exp_dis = 1 - np.dot(U, E) / denominator
    return exp_dis
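
The metric boils down to 1 - (U·E)/(E·E), where U and E are the feature-exposure vectors of pred_col and other_col. A standalone sketch with made-up exposure vectors:

import numpy as np

# Made-up exposures of two prediction columns to the same three features.
U = np.array([0.10, -0.05, 0.02])  # exposures of pred_col
E = np.array([0.08, -0.04, 0.01])  # exposures of other_col

denominator = np.dot(E, E)
exp_dis = 0 if denominator == 0 else 1 - np.dot(U, E) / denominator
print(exp_dis)  # near 0 for similar exposure patterns, larger when they diverge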

feature_neutral_mean_std_sharpe(dataf, pred_col, target_col, feature_names)

Feature neutralized mean performance. More info: https://docs.numer.ai/tournament/feature-neutral-correlation

Source code in numerblox/evaluation.py
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
def feature_neutral_mean_std_sharpe(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str, feature_names: list
) -> Tuple[np.float64, np.float64, np.float64]:
    """
    Feature neutralized mean performance.
    More info: https://docs.numer.ai/tournament/feature-neutral-correlation
    """
    fn = FeatureNeutralizer(pred_name=pred_col, proportion=1.0)
    fn.set_predict_request(features=True, era_series=True)
    neutralized_preds = fn.predict(
        dataf[pred_col], features=dataf[feature_names], era_series=dataf[self.era_col]
    )
    # Construct new DataFrame with era col, target col and preds
    neutralized_dataf = pd.DataFrame(columns=[self.era_col, target_col, pred_col])
    neutralized_dataf[self.era_col] = dataf[self.era_col]
    neutralized_dataf[target_col] = dataf[target_col]
    neutralized_dataf[pred_col] = neutralized_preds

    neutral_corrs = self.per_era_numerai_corrs(
        dataf=neutralized_dataf,
        pred_col=pred_col,
        target_col=target_col,
    )
    mean, std, sharpe = self.mean_std_sharpe(era_corrs=neutral_corrs)
    return mean, std, sharpe

full_evaluation(dataf, pred_cols, target_col='target', benchmark_cols=None)

Perform evaluation for each prediction column in pred_cols. By default only the "prediction" column is evaluated. Evaluation is done against given target and benchmark prediction column. :param dataf: DataFrame containing era_col, pred_cols, target_col and optional benchmark_cols. :param pred_cols: List of prediction columns to calculate evaluation metrics for. :param target_col: Target column to evaluate against. :param benchmark_cols: Optional list of benchmark columns to calculate evaluation metrics for.

Source code in numerblox/evaluation.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def full_evaluation(
    self,
    dataf: pd.DataFrame,
    pred_cols: List[str],
    target_col: str = "target",
    benchmark_cols: list = None,
) -> pd.DataFrame:
    """
    Perform evaluation for each prediction column in pred_cols.
    By default only the "prediction" column is evaluated.
    Evaluation is done against given target and benchmark prediction column.
    :param dataf: DataFrame containing era_col, pred_cols, target_col and optional benchmark_cols.
    :param pred_cols: List of prediction columns to calculate evaluation metrics for.
    :param target_col: Target column to evaluate against.
    :param benchmark_cols: Optional list of benchmark columns to calculate evaluation metrics for.
    """
    val_stats = pd.DataFrame()
    feature_cols = [col for col in dataf.columns if col.startswith("feature")]
    cat_cols = (
        dataf[feature_cols].select_dtypes(include=["category"]).columns.to_list()
    )
    if cat_cols:
        print(
            f"WARNING: Categorical features detected that cannot be used for neutralization. Removing columns: '{cat_cols}' for evaluation."
        )
        dataf.loc[:, feature_cols] = dataf[feature_cols].select_dtypes(
            exclude=["category"]
        )
    dataf = dataf.fillna(0.5)
    for col in tqdm(pred_cols, desc="Evaluation: "):
        col_stats = self.evaluation_one_col(
            dataf=dataf,
            pred_col=col,
            feature_cols=feature_cols,
            target_col=target_col,
            benchmark_cols=benchmark_cols,
        )
        val_stats = pd.concat([val_stats, col_stats], axis=0)
    return val_stats
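
A minimal end-to-end sketch on synthetic data (column names and values are made up purely for illustration; metrics_list defaults to the FAST_METRICS preset):

import numpy as np
import pandas as pd
from numerblox.evaluation import NumeraiClassicEvaluator

rng = np.random.default_rng(42)
n = 300
df = pd.DataFrame({
    "era": np.repeat(["0001", "0002", "0003"], n // 3),
    # Columns starting with "feature" are picked up automatically as feature_cols.
    "feature_a": rng.uniform(size=n),
    "feature_b": rng.uniform(size=n),
    "prediction": rng.uniform(size=n),
    "target": rng.choice([0.0, 0.25, 0.5, 0.75, 1.0], size=n),
})

evaluator = NumeraiClassicEvaluator(era_col="era")
metrics = evaluator.full_evaluation(dataf=df, pred_cols=["prediction"], target_col="target")
print(metrics.T)  # one row of metrics per prediction column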

get_feature_exposures_corrv2(dataf, pred_col, feature_list, cpu_cores=-1)

Calculate feature exposures for each era using 'Numerai Corr'. Results will be similar to get_feature_exposures() but more accurate. This method will take longer to compute.

:param dataf: DataFrame containing predictions, features, and eras. :param pred_col: Prediction column to calculate feature exposures for. :param feature_list: List of feature columns in X. :param cpu_cores: Number of CPU cores to use for parallelization. Default: -1 (all cores). :return: DataFrame with Corrv2 feature exposures by era for each feature.

Source code in numerblox/evaluation.py
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
def get_feature_exposures_corrv2(
    self,
    dataf: pd.DataFrame,
    pred_col: str,
    feature_list: List[str],
    cpu_cores: int = -1,
) -> pd.DataFrame:
    """
    Calculate feature exposures for each era using 'Numerai Corr'.
    Results will be similar to get_feature_exposures() but more accurate.
    This method will take longer to compute.

    :param dataf: DataFrame containing predictions, features, and eras.
    :param pred_col: Prediction column to calculate feature exposures for.
    :param feature_list: List of feature columns in X.
    :param cpu_cores: Number of CPU cores to use for parallelization.
    Default: -1 (all cores).
    :return: DataFrame with Corrv2 feature exposures by era for each feature.
    """

    def calculate_era_feature_exposure(era, group, pred_col, feature_list):
        exposures = {}
        for feature in feature_list:
            corr = self.numerai_corr(
                group, pred_col=f"{pred_col}_normalized", target_col=feature
            )
            exposures[feature] = corr
        return era, exposures

    normalized_ranks = (dataf[[pred_col]].rank(method="first") - 0.5) / len(dataf)
    dataf[f"{pred_col}_normalized"] = stats.norm.ppf(normalized_ranks)
    feature_exposure_data = pd.DataFrame(
        index=dataf["era"].unique(), columns=feature_list
    )

    grouped_data = list(dataf.groupby("era"))

    results = Parallel(n_jobs=cpu_cores)(
        delayed(calculate_era_feature_exposure)(era, group, pred_col, feature_list)
        for era, group in grouped_data
    )
    for era, exposures in results:
        feature_exposure_data.loc[era, :] = exposures
    return feature_exposure_data

get_feature_exposures_pearson(dataf, pred_col, feature_list, cpu_cores=-1)

Calculate feature exposures for each era using Pearson correlation.

:param dataf: DataFrame containing predictions, features, and eras. :param pred_col: Prediction column to calculate feature exposures for. :param feature_list: List of feature columns in X. :param cpu_cores: Number of CPU cores to use for parallelization. :return: DataFrame with Pearson feature exposures by era for each feature.

Source code in numerblox/evaluation.py
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
def get_feature_exposures_pearson(
    self,
    dataf: pd.DataFrame,
    pred_col: str,
    feature_list: List[str],
    cpu_cores: int = -1,
) -> pd.DataFrame:
    """
    Calculate feature exposures for each era using Pearson correlation.

    :param dataf: DataFrame containing predictions, features, and eras.
    :param pred_col: Prediction column to calculate feature exposures for.
    :param feature_list: List of feature columns in X.
    :param cpu_cores: Number of CPU cores to use for parallelization.
    :return: DataFrame with Pearson feature exposures by era for each feature.
    """

    def calculate_era_pearson_exposure(
        era, group, feature_list, pred_col_normalized
    ):
        data_matrix = group[feature_list + [pred_col_normalized]].values
        correlations = np.corrcoef(data_matrix, rowvar=False)

        # Get the correlations of all features with the predictions (which is the last column)
        feature_correlations = correlations[:-1, -1]
        return era, feature_correlations

    normalized_ranks = (dataf[[pred_col]].rank(method="first") - 0.5) / len(dataf)
    dataf[f"{pred_col}_normalized"] = stats.norm.ppf(normalized_ranks)
    feature_exposure_data = pd.DataFrame(
        index=dataf["era"].unique(), columns=feature_list
    )

    grouped_data = list(dataf.groupby("era"))

    results = Parallel(n_jobs=cpu_cores)(
        delayed(calculate_era_pearson_exposure)(
            era, group, feature_list, f"{pred_col}_normalized"
        )
        for era, group in grouped_data
    )

    for era, feature_correlations in results:
        feature_exposure_data.loc[era, :] = feature_correlations
    return feature_exposure_data

legacy_contribution(dataf, pred_col, target_col, other_col)

Legacy contribution mean, standard deviation and Sharpe ratio. More info: https://forum.numer.ai/t/mmc2-announcement/93

:param dataf: DataFrame containing era_col, pred_col, target_col and other_col. :param pred_col: Prediction column to calculate MMC for. :param target_col: Target column to calculate MMC against. :param other_col: Meta model column containing predictions to neutralize against.

:return: List of legacy contribution scores by era.

Source code in numerblox/evaluation.py
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
def legacy_contribution(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str, other_col: str
):
    """
    Legacy contribution mean, standard deviation and Sharpe ratio.
    More info: https://forum.numer.ai/t/mmc2-announcement/93

    :param dataf: DataFrame containing era_col, pred_col, target_col and other_col.
    :param pred_col: Prediction column to calculate MMC for.
    :param target_col: Target column to calculate MMC against.
    :param other_col: Meta model column containing predictions to neutralize against.

    :return: List of legacy contribution scores by era.
    """
    legacy_mc_scores = []
    # Standard deviation of a uniform distribution
    COVARIANCE_FACTOR = 0.29**2
    # Calculate MMC for each era
    for _, x in dataf.groupby(self.era_col):
        series = self._neutralize_series(
            self._normalize_uniform(x[pred_col]), (x[other_col])
        )
        legacy_mc_scores.append(
            np.cov(series, x[target_col])[0, 1] / COVARIANCE_FACTOR
        )

    return legacy_mc_scores
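
The hard-coded COVARIANCE_FACTOR above is the square of 0.29, which approximates the standard deviation of a Uniform(0, 1) variable, 1/sqrt(12) ≈ 0.2887. A one-line check:

import numpy as np

print(1 / np.sqrt(12))  # ~0.2887, the standard deviation of a Uniform(0, 1) variable
print(0.29 ** 2)        # the COVARIANCE_FACTOR used above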

max_drawdown(era_corrs) staticmethod

Maximum drawdown per era.

Source code in numerblox/evaluation.py
455
456
457
458
459
460
461
462
463
464
@staticmethod
def max_drawdown(era_corrs: pd.Series) -> np.float64:
    """Maximum drawdown per era."""
    # Arbitrarily large window
    rolling_max = (
        (era_corrs + 1).cumprod().rolling(window=9000, min_periods=1).max()
    )
    daily_value = (era_corrs + 1).cumprod()
    max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
    return max_drawdown
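
Because the era scores are compounded before the drawdown is taken, the result is expressed as a negative fraction of the running peak. A standalone mirror on toy values:

import pandas as pd

era_corrs = pd.Series([0.03, -0.02, -0.04, 0.01, 0.05])  # toy values

cumulative = (era_corrs + 1).cumprod()
rolling_max = cumulative.rolling(window=9000, min_periods=1).max()
max_drawdown = -((rolling_max - cumulative) / rolling_max).max()
print(max_drawdown)  # roughly -0.059 for the toy series above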

max_feature_exposure(dataf, feature_cols, pred_col)

Maximum exposure over all features.

Source code in numerblox/evaluation.py
500
501
502
503
504
505
506
507
508
def max_feature_exposure(
    self, dataf: pd.DataFrame, feature_cols: List[str], pred_col: str
) -> np.float64:
    """Maximum exposure over all features."""
    max_per_era = dataf.groupby(self.era_col).apply(
        lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max()
    )
    max_feature_exposure = max_per_era.mean(skipna=True)
    return max_feature_exposure

mean_std_sharpe(era_corrs)

Average, standard deviation and Sharpe ratio for correlations per era.

Source code in numerblox/evaluation.py
419
420
421
422
423
424
425
426
427
428
429
def mean_std_sharpe(
    self, era_corrs: pd.Series
) -> Tuple[np.float64, np.float64, np.float64]:
    """
    Average, standard deviation and Sharpe ratio for
    correlations per era.
    """
    mean = pd.Series(era_corrs.mean()).item()
    std = pd.Series(era_corrs.std(ddof=0)).item()
    sharpe = np.nan if std == 0 else mean / std
    return mean, std, sharpe

numerai_corr(dataf, pred_col, target_col)

Computes 'Numerai Corr' aka 'Corrv2'. More info: https://forum.numer.ai/t/target-cyrus-new-primary-target/6303

Assumes original target col as input (i.e. in [0, 1] range).

Source code in numerblox/evaluation.py
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
def numerai_corr(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str
) -> np.float64:
    """
    Computes 'Numerai Corr' aka 'Corrv2'.
    More info: https://forum.numer.ai/t/target-cyrus-new-primary-target/6303

    Assumes original target col as input (i.e. in [0, 1] range).
    """
    # Rank and gaussianize predictions
    ranked_preds = self._normalize_uniform(
        dataf[pred_col].fillna(0.5), method="average"
    )
    gauss_ranked_preds = stats.norm.ppf(ranked_preds)
    # Center target from [0...1] to [-0.5...0.5] range
    targets = dataf[target_col]
    centered_target = targets - targets.mean()
    # Accentuate tails of predictions and targets
    preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5
    target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5
    # Pearson correlation
    corr, _ = stats.pearsonr(preds_p15, target_p15)
    return corr
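
A standalone sketch of the same transformation on toy data (rank and gaussianize the predictions, center the [0, 1] target, accentuate both tails with a power of 1.5, then take a Pearson correlation):

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(7)
preds = pd.Series(rng.uniform(size=200))                                # toy predictions
targets = pd.Series(rng.choice([0.0, 0.25, 0.5, 0.75, 1.0], size=200))  # toy [0, 1] target

ranked_preds = (preds.rank(method="average") - 0.5) / len(preds)  # tie-kept uniform ranks
gauss_ranked_preds = stats.norm.ppf(ranked_preds)
centered_target = targets - targets.mean()
preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5
target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5
corr, _ = stats.pearsonr(preds_p15, target_p15)
print(corr)  # 'Numerai Corr' / Corrv2 for the toy data (close to 0 for random inputs)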

per_era_corrs(dataf, pred_col, target_col)

Correlation between prediction and target for each era.

Source code in numerblox/evaluation.py
401
402
403
404
405
406
407
408
409
def per_era_corrs(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str
) -> pd.Series:
    """Correlation between prediction and target for each era."""
    return dataf.groupby(self.era_col).apply(
        lambda d: self._normalize_uniform(d[pred_col].fillna(0.5)).corr(
            d[target_col]
        )
    )

per_era_numerai_corrs(dataf, pred_col, target_col)

Numerai Corr between prediction and target for each era.

Source code in numerblox/evaluation.py
411
412
413
414
415
416
417
def per_era_numerai_corrs(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str
) -> pd.Series:
    """Numerai Corr between prediction and target for each era."""
    return dataf.groupby(self.era_col).apply(
        lambda d: self.numerai_corr(d.fillna(0.5), pred_col, target_col)
    )

plot_correlations(dataf, pred_cols, corr_cols=None, target_col='target', roll_mean=20)

Plot per era correlations over time. :param dataf: DataFrame that contains at least all pred_cols, target_col and corr_cols. :param pred_cols: List of prediction columns to calculate per era correlations for and plot. :param corr_cols: Per era correlations already prepared to include in the plot. This is optional for if you already have per era correlations prepared in your input dataf. :param target_col: Target column name to compute per era correlations against. :param roll_mean: How many eras should be averaged to compute a rolling score.

Source code in numerblox/evaluation.py
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
def plot_correlations(
    self,
    dataf: pd.DataFrame,
    pred_cols: List[str],
    corr_cols: list = None,
    target_col: str = "target",
    roll_mean: int = 20,
):
    """
    Plot per era correlations over time.
    :param dataf: DataFrame that contains at least all pred_cols, target_col and corr_cols.
    :param pred_cols: List of prediction columns to calculate per era correlations for and plot.
    :param corr_cols: Per era correlations already prepared to include in the plot.
    This is optional for if you already have per era correlations prepared in your input dataf.
    :param target_col: Target column name to compute per era correlations against.
    :param roll_mean: How many eras should be averaged to compute a rolling score.
    """
    validation_by_eras = pd.DataFrame()
    # Compute per era correlation for each prediction column.
    for pred_col in pred_cols:
        per_era_corrs = self.per_era_numerai_corrs(
            dataf, pred_col=pred_col, target_col=target_col
        )
        validation_by_eras.loc[:, pred_col] = per_era_corrs

    # Add prepared per era correlation if any.
    if corr_cols is not None:
        for corr_col in corr_cols:
            validation_by_eras.loc[:, corr_col] = dataf[corr_col]

    validation_by_eras.rolling(roll_mean).mean().plot(
        kind="line",
        marker="o",
        ms=4,
        title=f"Rolling Per Era Correlation Mean (rolling window size: {roll_mean})",
        figsize=(15, 5),
    )
    plt.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.05),
        fancybox=True,
        shadow=True,
        ncol=1,
    )
    plt.axhline(y=0.0, color="r", linestyle="--")
    plt.show()

    validation_by_eras.cumsum().plot(
        title="Cumulative Sum of Era Correlations", figsize=(15, 5)
    )
    plt.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.05),
        fancybox=True,
        shadow=True,
        ncol=1,
    )
    plt.axhline(y=0.0, color="r", linestyle="--")
    plt.show()
    return

smart_sharpe(era_corrs)

Sharpe adjusted for autocorrelation. :param era_corrs: Correlation scores by era

Source code in numerblox/evaluation.py
749
750
751
752
753
754
755
756
def smart_sharpe(self, era_corrs: pd.Series) -> np.float64:
    """
    Sharpe adjusted for autocorrelation.
    :param era_corrs: Correlation scores by era
    """
    return np.nanmean(era_corrs) / (
        np.nanstd(era_corrs, ddof=1) * self.autocorr_penalty(era_corrs)
    )
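
A standalone sketch comparing a plain Sharpe with the autocorrelation-adjusted version on a toy, positively autocorrelated series (values made up):

import numpy as np
import pandas as pd

era_corrs = pd.Series([0.020, 0.025, 0.030, 0.028, 0.022, 0.018])  # toy values

n = len(era_corrs)
p = np.corrcoef(era_corrs[:-1], era_corrs[1:])[0, 1]  # 1st order autocorrelation
penalty = np.sqrt(1 + 2 * np.sum([((n - i) / n) * p ** i for i in range(1, n)]))

plain_sharpe = np.nanmean(era_corrs) / np.nanstd(era_corrs, ddof=1)
smart_sharpe = plain_sharpe / penalty  # equivalent to the method above
print(plain_sharpe, smart_sharpe)      # smart_sharpe is lower whenever the penalty exceeds 1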

tbx_mean_std_sharpe(dataf, pred_col, target_col, tb=200)

Calculate Mean, Standard deviation and Sharpe ratio when we focus on the x top and x bottom predictions. :param tb: How many of top and bottom predictions to focus on. TB200 and TB500 are the most common situations.

Source code in numerblox/evaluation.py
536
537
538
539
540
541
542
543
544
545
546
547
548
def tbx_mean_std_sharpe(
    self, dataf: pd.DataFrame, pred_col: str, target_col: str, tb: int = 200
) -> Tuple[np.float64, np.float64, np.float64]:
    """
    Calculate Mean, Standard deviation and Sharpe ratio
    when we focus on the x top and x bottom predictions.
    :param tb: How many of top and bottom predictions to focus on.
    TB200 and TB500 are the most common situations.
    """
    tb_val_corrs = self._score_by_date(
        dataf=dataf, columns=[pred_col], target=target_col, tb=tb
    )
    return self.mean_std_sharpe(era_corrs=tb_val_corrs)
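
Under the hood this relies on the TB selection in _score_by_date: per era, only the tb lowest- and tb highest-ranked predictions are kept before correlating with the target. A standalone sketch of that selection on toy data:

import numpy as np

rng = np.random.default_rng(3)
era_pred = rng.uniform(size=(1, 1000))   # one prediction column for a single toy era
era_target = rng.uniform(size=1000)      # toy target values
tb = 200

tbidx = np.argsort(era_pred, axis=1)
tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)  # bottom 200 + top 200
ccs = [
    np.corrcoef(era_target[idx], pred[idx])[0, 1]
    for idx, pred in zip(tbidx, era_pred)
]
print(ccs)  # per-column correlation computed on the 400 selected rows only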

NumeraiClassicEvaluator

Bases: BaseEvaluator

Evaluator for all metrics that are relevant in Numerai Classic.

Source code in numerblox/evaluation.py
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
class NumeraiClassicEvaluator(BaseEvaluator):
    """
    Evaluator for all metrics that are relevant in Numerai Classic.
    """

    def __init__(
        self,
        era_col: str = "era",
        metrics_list: List[str] = FAST_METRICS,
        custom_functions: Dict[str, Dict[str, Any]] = None,
        show_detailed_progress_bar: bool = True,
    ):
        for metric in metrics_list:
            assert (
                metric in ALL_CLASSIC_METRICS
            ), f"Metric '{metric}' not found. Valid metrics: {ALL_CLASSIC_METRICS}."
        super().__init__(
            era_col=era_col, metrics_list=metrics_list, custom_functions=custom_functions,
            show_detailed_progress_bar=show_detailed_progress_bar
        )
        self.fncv3_features = FNCV3_FEATURES

    def full_evaluation(
        self,
        dataf: pd.DataFrame,
        pred_cols: List[str],
        target_col: str = "target",
        benchmark_cols: list = None,
    ) -> pd.DataFrame:
        val_stats = pd.DataFrame()
        dataf = dataf.fillna(0.5)
        feature_cols = [col for col in dataf.columns if col.startswith("feature")]

        # Check if sufficient columns are present in dataf to compute FNC
        feature_set = set(dataf.columns)
        if set(self.fncv3_features).issubset(feature_set):
            print(
                "Using 'v4.2/features.json/fncv3_features' feature set to calculate FNC metrics."
            )
            valid_features = self.fncv3_features
        else:
            print(
                "WARNING: No suitable feature set defined for FNC. Skipping calculation of FNC."
            )
            valid_features = []

        with tqdm(pred_cols, desc="Evaluation") as pbar:
            for col in pbar:
                # Metrics that can be calculated for both Numerai Classic and Signals
                col_stats = self.evaluation_one_col(
                    dataf=dataf,
                    feature_cols=feature_cols,
                    pred_col=col,
                    target_col=target_col,
                    benchmark_cols=benchmark_cols,
                )
                # Numerai Classic specific metrics
                if valid_features and "fncv3_mean_std_sharpe" in self.metrics_list:
                    pbar.set_description_str(f"fncv3_mean_std_sharpe for evaluation of '{col}'")
                    # Using only valid features defined in FNCV3_FEATURES
                    fnc_v3, fn_std_v3, fn_sharpe_v3 = self.feature_neutral_mean_std_sharpe(
                        dataf=dataf,
                        pred_col=col,
                        target_col=target_col,
                        feature_names=valid_features,
                    )
                    col_stats.loc[col, "feature_neutral_mean_v3"] = fnc_v3
                    col_stats.loc[col, "feature_neutral_std_v3"] = fn_std_v3
                    col_stats.loc[col, "feature_neutral_sharpe_v3"] = fn_sharpe_v3

                val_stats = pd.concat([val_stats, col_stats], axis=0)
        return val_stats

NumeraiSignalsEvaluator

Bases: BaseEvaluator

Evaluator for all metrics that are relevant in Numerai Signals.

Source code in numerblox/evaluation.py
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
class NumeraiSignalsEvaluator(BaseEvaluator):
    """Evaluator for all metrics that are relevant in Numerai Signals."""
    # Columns retrievable from Numerai Signals diagnostics.
    # More info: https://forum.numer.ai/t/signals-diagnostics-guide/5950
    VALID_DIAGNOSTICS_COLS = ["validationCorrV4", "validationFncV4", "validationIcV2", "validationRic"]

    def __init__(
        self,
        era_col: str = "date",
        metrics_list: List[str] = FAST_METRICS,
        custom_functions: Dict[str, Dict[str, Any]] = None,
        show_detailed_progress_bar: bool = True,
    ):
        for metric in metrics_list:
            assert (
                metric in ALL_SIGNALS_METRICS
            ), f"Metric '{metric}' not found. Valid metrics: {ALL_SIGNALS_METRICS}."
        super().__init__(
            era_col=era_col, metrics_list=metrics_list, custom_functions=custom_functions,
            show_detailed_progress_bar=show_detailed_progress_bar
        )

    def get_diagnostics(
        self, val_dataf: pd.DataFrame, model_name: str, key: Key, timeout_min: int = 2,
        col: Union[str, None] = "validationFncV4"
    ) -> pd.DataFrame:
        """
        Retrieved neutralized validation correlation by era. \n
        Calculated on Numerai servers. \n
        :param val_dataf: A DataFrame containing prediction, date, ticker and data_type columns. \n
        data_type column should contain 'validation' instances. \n
        :param model_name: Any model name for which you have authentication credentials. \n
        :param key: Key object to authenticate upload of diagnostics. \n
        :param timeout_min: How many minutes to wait on diagnostics Computing on Numerai servers before timing out. \n
        :param col: Which column to return. Should be one of ['validationCorrV4', 'validationFncV4', 'validationIcV2', 'validationRic']. If None, all columns will be returned. \n
        2 minutes by default. \n
        :return: Pandas Series with era as index and neutralized validation correlations (validationCorr).
        """
        assert col in self.VALID_DIAGNOSTICS_COLS or col is None, f"corr_col should be one of {self.VALID_DIAGNOSTICS_COLS} or None. Got: '{col}'"
        api = SignalsAPI(public_id=key.pub_id, secret_key=key.secret_key)
        model_id = api.get_models()[model_name]
        diagnostics_id = api.upload_diagnostics(df=val_dataf, model_id=model_id)
        data = self.__await_diagnostics(
            api=api,
            model_id=model_id,
            diagnostics_id=diagnostics_id,
            timeout_min=timeout_min,
        )
        diagnostics_df = pd.DataFrame(data["perEraDiagnostics"]).set_index("era")
        diagnostics_df.index = pd.to_datetime(diagnostics_df.index)
        return_cols = [col] if col is not None else self.VALID_DIAGNOSTICS_COLS
        return diagnostics_df[return_cols]

    @staticmethod
    def __await_diagnostics(
        api: SignalsAPI,
        model_id: str,
        diagnostics_id: str,
        timeout_min: int,
        interval_sec: int = 15,
    ):
        """
        Wait for diagnostics to be uploaded.
        Try every 'interval_sec' seconds until 'timeout_min' minutes have passed.
        """
        timeout = time.time() + 60 * timeout_min
        data = {"status": "not_done"}
        while time.time() < timeout:
            data = api.diagnostics(model_id=model_id, diagnostics_id=diagnostics_id)[0]
            if not data["status"] == "done":
                print(
                    f"Diagnostics not processed yet. Sleeping for another {interval_sec} seconds."
                )
                time.sleep(interval_sec)
            else:
                break
        if not data["status"] == "done":
            raise Exception(
                f"Diagnostics couldn't be retrieved within {timeout_min} minutes after uploading. Check if Numerai API is offline."
            )
        return data

__await_diagnostics(api, model_id, diagnostics_id, timeout_min, interval_sec=15) staticmethod

Wait for diagnostics to be uploaded. Try every 'interval_sec' seconds until 'timeout_min' minutes have passed.

Source code in numerblox/evaluation.py
@staticmethod
def __await_diagnostics(
    api: SignalsAPI,
    model_id: str,
    diagnostics_id: str,
    timeout_min: int,
    interval_sec: int = 15,
):
    """
    Wait for diagnostics to be uploaded.
    Try every 'interval_sec' seconds until 'timeout_min' minutes have passed.
    """
    timeout = time.time() + 60 * timeout_min
    data = {"status": "not_done"}
    while time.time() < timeout:
        data = api.diagnostics(model_id=model_id, diagnostics_id=diagnostics_id)[0]
        if not data["status"] == "done":
            print(
                f"Diagnostics not processed yet. Sleeping for another {interval_sec} seconds."
            )
            time.sleep(interval_sec)
        else:
            break
    if not data["status"] == "done":
        raise Exception(
            f"Diagnostics couldn't be retrieved within {timeout_min} minutes after uploading. Check if Numerai API is offline."
        )
    return data

get_diagnostics(val_dataf, model_name, key, timeout_min=2, col='validationFncV4')

Retrieves the neutralized validation correlations by era.

Calculated on Numerai servers.

:param val_dataf: A DataFrame containing prediction, date, ticker and data_type columns.

data_type column should contain 'validation' instances.

:param model_name: Any model name for which you have authentication credentials.

:param key: Key object to authenticate upload of diagnostics.

:param timeout_min: How many minutes to wait for diagnostics to be computed on Numerai servers before timing out. 2 minutes by default.

:param col: Which column to return. Should be one of ['validationCorrV4', 'validationFncV4', 'validationIcV2', 'validationRic']. If None, all columns will be returned.

:return: Pandas DataFrame with era as index and the neutralized validation diagnostics for the requested column(s).

Source code in numerblox/evaluation.py
def get_diagnostics(
    self, val_dataf: pd.DataFrame, model_name: str, key: Key, timeout_min: int = 2,
    col: Union[str, None] = "validationFncV4"
) -> pd.DataFrame:
    """
    Retrieved neutralized validation correlation by era. \n
    Calculated on Numerai servers. \n
    :param val_dataf: A DataFrame containing prediction, date, ticker and data_type columns. \n
    data_type column should contain 'validation' instances. \n
    :param model_name: Any model name for which you have authentication credentials. \n
    :param key: Key object to authenticate upload of diagnostics. \n
    :param timeout_min: How many minutes to wait on diagnostics Computing on Numerai servers before timing out. \n
    :param col: Which column to return. Should be one of ['validationCorrV4', 'validationFncV4', 'validationIcV2', 'validationRic']. If None, all columns will be returned. \n
    2 minutes by default. \n
    :return: Pandas Series with era as index and neutralized validation correlations (validationCorr).
    """
    assert col in self.VALID_DIAGNOSTICS_COLS or col is None, f"corr_col should be one of {self.VALID_DIAGNOSTICS_COLS} or None. Got: '{col}'"
    api = SignalsAPI(public_id=key.pub_id, secret_key=key.secret_key)
    model_id = api.get_models()[model_name]
    diagnostics_id = api.upload_diagnostics(df=val_dataf, model_id=model_id)
    data = self.__await_diagnostics(
        api=api,
        model_id=model_id,
        diagnostics_id=diagnostics_id,
        timeout_min=timeout_min,
    )
    diagnostics_df = pd.DataFrame(data["perEraDiagnostics"]).set_index("era")
    diagnostics_df.index = pd.to_datetime(diagnostics_df.index)
    return_cols = [col] if col is not None else self.VALID_DIAGNOSTICS_COLS
    return diagnostics_df[return_cols]
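
As a usage sketch (not part of the library source), get_diagnostics can be called roughly as follows. The import path for Key, the credential values and the model name are assumptions; adjust them to your setup.

import pandas as pd
from numerblox.evaluation import NumeraiSignalsEvaluator
from numerblox.misc import Key  # assumed import path for Key; adjust if it differs in your version

# Hypothetical Signals validation DataFrame with prediction, date, ticker and data_type columns.
val_df = pd.read_csv("signals_validation_predictions.csv")  # placeholder path

key = Key(pub_id="YOUR_PUBLIC_ID", secret_key="YOUR_SECRET_KEY")  # placeholder credentials
evaluator = NumeraiSignalsEvaluator(era_col="date")
fnc_per_era = evaluator.get_diagnostics(
    val_dataf=val_df,
    model_name="your_signals_model",  # a model you have credentials for
    key=key,
    timeout_min=5,
    col="validationFncV4",
)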

BaseSubmitter

Bases: BaseIO

Basic functionality for submitting to Numerai. Uses numerapi under the hood. More info: https://numerapi.readthedocs.io/

:param directory_path: Directory to store and read submissions from.

:param api: NumerAPI or SignalsAPI

:param max_retries: Maximum number of retries for uploading predictions to Numerai.

:param sleep_time: Time to sleep between uploading retries.

:param fail_silently: Whether to skip uploading to Numerai without raising an error. Useful if you are uploading many models in a loop and want to skip models that fail to upload.

Source code in numerblox/submission.py
class BaseSubmitter(BaseIO):
    """
    Basic functionality for submitting to Numerai. 
    Uses numerapi under the hood.
    More info: https://numerapi.readthedocs.io/ 

    :param directory_path: Directory to store and read submissions from. 
    :param api: NumerAPI or SignalsAPI
    :param max_retries: Maximum number of retries for uploading predictions to Numerai. 
    :param sleep_time: Time to sleep between uploading retries.
    :param fail_silently: Whether to skip uploading to Numerai without raising an error. 
    Useful for if you are uploading many models in a loop and want to skip models that fail to upload.
    """
    def __init__(self, directory_path: str, api: Union[NumerAPI, SignalsAPI], max_retries: int, 
                 sleep_time: int, fail_silently: bool):
        super().__init__(directory_path)
        self.api = api
        self.max_retries = max_retries
        self.sleep_time = sleep_time
        self.fail_silently = fail_silently

    @abstractmethod
    def save_csv(
        self,
        dataf: pd.DataFrame,
        file_name: str,
        cols: Union[str, list],
        *args,
        **kwargs,
    ):
        """
        For Numerai Classic: Save index column + 'cols' (targets) to CSV.
        For Numerai Signals: Save ticker, date, data_type and signal columns to CSV.
        """
        ...

    def upload_predictions(self, file_name: str, model_name: str, *args, **kwargs):
        """
        Upload CSV file to Numerai for given model name.
        :param file_name: File name/path relative to directory_path.
        :param model_name: Lowercase raw model name (For example, 'integration_test').
        """
        full_path = str(self.dir / file_name)
        model_id = self._get_model_id(model_name=model_name)
        api_type = str(self.api.__class__.__name__)
        print(
            f"{api_type}: Uploading predictions from '{full_path}' for model '{model_name}' (model_id='{model_id}')"
        )
        for attempt in range(self.max_retries):
            try:
                self.api.upload_predictions(
                    file_path=full_path, model_id=model_id, *args, **kwargs
                )
                print(
                    f"{api_type} submission of '{full_path}' for '{model_name}' is successful!"
                )
                return
            except Exception as e:
                if attempt < self.max_retries - 1:  # i.e. not the last attempt
                    print(f"Failed to upload '{full_path}' for '{model_name}' to Numerai. Retrying in {self.sleep_time} seconds...")
                    print(f"Error: {e}")
                    time.sleep(self.sleep_time)
                else:
                    if self.fail_silently:
                        print(f"Failed to upload'{full_path}' for '{model_name}' to Numerai. Skipping...")
                        print(f"Error: {e}")
                    else:
                        print(f"Failed to upload '{full_path}' for '{model_name}' to Numerai after {self.max_retries} attempts.")
                        raise e

    def full_submission(
        self,
        dataf: pd.DataFrame,
        model_name: str,
        cols: Union[str, list],
        file_name: str = 'submission.csv',
        *args,
        **kwargs,
    ):
        """
        Save DataFrame to csv and upload predictions through API.

        :param dataf: Main DataFrame containing `cols`.
        :param model_name: Lowercase Numerai model name.
        :param file_name: path to save model to relative to base directory.
        :param cols: Columns to be saved in submission file.
        1 prediction column for Numerai Classic.
        At least 1 prediction column and 1 ticker column for Numerai Signals.
        *args, **kwargs are passed to numerapi API.
        For example `version` argument in Numerai Classic submissions.
        """
        self.save_csv(dataf=dataf, file_name=file_name, cols=cols)
        self.upload_predictions(
            file_name=file_name, model_name=model_name,
            *args, **kwargs
        )

    def combine_csvs(self, csv_paths: list,
                     aux_cols: list,
                     era_col: str = None,
                     pred_col: str = 'prediction') -> pd.DataFrame:
        """
        Read in csv files and combine all predictions with a rank mean. \n
        Multi-target predictions will be averaged out. \n
        :param csv_paths: List of full paths to .csv prediction files. \n
        :param aux_cols: ['id'] for Numerai Classic. \n
        ['ticker', 'last_friday', 'data_type'], for example, with Numerai Signals. \n
        :param era_col: Column indicating era ('era' or 'last_friday'). \n
        Will be used for Grouping the rank mean if given. Skip groupby if no era_col provided. \n
        :param pred_col: 'prediction' for Numerai Classic and 'signal' for Numerai Signals.
        """
        all_datafs = [pd.read_csv(path, index_col=aux_cols) for path in tqdm(csv_paths)]
        final_dataf = pd.concat(all_datafs, axis="columns")
        # Remove issue of duplicate columns
        numeric_cols = final_dataf.select_dtypes(include=np.number).columns
        final_dataf.rename({k: str(v) for k, v in zip(numeric_cols, range(len(numeric_cols)))},
                           axis=1,
                           inplace=True)
        # Combine all numeric columns with rank mean
        num_dataf = final_dataf.select_dtypes(include=np.number)
        num_dataf = num_dataf.groupby(era_col) if era_col else num_dataf
        final_dataf[pred_col] = num_dataf.rank(pct=True, method="first").mean(axis=1)
        return final_dataf[[pred_col]]

    def _get_model_id(self, model_name: str) -> str:
        """
        Get ID needed for prediction uploading.
        :param model_name: Raw lowercase model name
        of Numerai model that you have access to.
        """
        return self.get_model_mapping[model_name]

    @property
    def get_model_mapping(self) -> dict:
        """Mapping between raw model names and model IDs."""
        return self.api.get_models()

    def _check_value_range(self, dataf: pd.DataFrame, cols: Union[str, list]):
        """ Check if all predictions are in range (0...1). """
        cols = [cols] if isinstance(cols, str) else cols
        for col in cols:
            if not dataf[col].between(0, 1).all():
                min_val, max_val = dataf[col].min(), dataf[col].max()
                raise ValueError(
                    f"Values must be between 0 and 1. \
Found min value of '{min_val}' and max value of '{max_val}' for column '{col}'."
                )

    def __call__(
            self,
            dataf: pd.DataFrame,
            model_name: str,
            file_name: str = "submission.csv",
            cols: Union[str, list] = "prediction",
            *args,
            **kwargs,
    ):
        """
        The most common use case will be to create a CSV and submit it immediately after that.
        full_submission handles this.
        """
        self.full_submission(
            dataf=dataf,
            file_name=file_name,
            model_name=model_name,
            cols=cols,
            *args,
            **kwargs,
        )

get_model_mapping: dict property

Mapping between raw model names and model IDs.

__call__(dataf, model_name, file_name='submission.csv', cols='prediction', *args, **kwargs)

The most common use case will be to create a CSV and submit it immediately after that. full_submission handles this.

Source code in numerblox/submission.py
def __call__(
        self,
        dataf: pd.DataFrame,
        model_name: str,
        file_name: str = "submission.csv",
        cols: Union[str, list] = "prediction",
        *args,
        **kwargs,
):
    """
    The most common use case will be to create a CSV and submit it immediately after that.
    full_submission handles this.
    """
    self.full_submission(
        dataf=dataf,
        file_name=file_name,
        model_name=model_name,
        cols=cols,
        *args,
        **kwargs,
    )

combine_csvs(csv_paths, aux_cols, era_col=None, pred_col='prediction')

Read in csv files and combine all predictions with a rank mean.

Multi-target predictions will be averaged out.

:param csv_paths: List of full paths to .csv prediction files.

:param aux_cols: ['id'] for Numerai Classic.

['ticker', 'last_friday', 'data_type'], for example, with Numerai Signals.

:param era_col: Column indicating era ('era' or 'last_friday').

Will be used for grouping the rank mean if given. The groupby step is skipped if no era_col is provided.

:param pred_col: 'prediction' for Numerai Classic and 'signal' for Numerai Signals.

Source code in numerblox/submission.py
def combine_csvs(self, csv_paths: list,
                 aux_cols: list,
                 era_col: str = None,
                 pred_col: str = 'prediction') -> pd.DataFrame:
    """
    Read in csv files and combine all predictions with a rank mean. \n
    Multi-target predictions will be averaged out. \n
    :param csv_paths: List of full paths to .csv prediction files. \n
    :param aux_cols: ['id'] for Numerai Classic. \n
    ['ticker', 'last_friday', 'data_type'], for example, with Numerai Signals. \n
    :param era_col: Column indicating era ('era' or 'last_friday'). \n
    Will be used for Grouping the rank mean if given. Skip groupby if no era_col provided. \n
    :param pred_col: 'prediction' for Numerai Classic and 'signal' for Numerai Signals.
    """
    all_datafs = [pd.read_csv(path, index_col=aux_cols) for path in tqdm(csv_paths)]
    final_dataf = pd.concat(all_datafs, axis="columns")
    # Remove issue of duplicate columns
    numeric_cols = final_dataf.select_dtypes(include=np.number).columns
    final_dataf.rename({k: str(v) for k, v in zip(numeric_cols, range(len(numeric_cols)))},
                       axis=1,
                       inplace=True)
    # Combine all numeric columns with rank mean
    num_dataf = final_dataf.select_dtypes(include=np.number)
    num_dataf = num_dataf.groupby(era_col) if era_col else num_dataf
    final_dataf[pred_col] = num_dataf.rank(pct=True, method="first").mean(axis=1)
    return final_dataf[[pred_col]]
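
A minimal sketch of combining several Numerai Classic prediction files with combine_csvs, assuming each CSV is indexed by 'id' and holds one numeric prediction column; the paths, the Key import path and the credentials are placeholders.

from numerblox.misc import Key  # assumed import path for Key
from numerblox.submission import NumeraiClassicSubmitter

key = Key(pub_id="YOUR_PUBLIC_ID", secret_key="YOUR_SECRET_KEY")  # placeholder credentials
submitter = NumeraiClassicSubmitter(directory_path="sub_dir", key=key)

# Each CSV is expected to contain an 'id' column plus one numeric prediction column.
combined = submitter.combine_csvs(
    csv_paths=["preds_model_a.csv", "preds_model_b.csv"],  # placeholder paths
    aux_cols=["id"],       # index column(s) for Numerai Classic
    era_col=None,          # no era grouping -> plain rank mean across files
    pred_col="prediction",
)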

full_submission(dataf, model_name, cols, file_name='submission.csv', *args, **kwargs)

Save DataFrame to csv and upload predictions through API.

:param dataf: Main DataFrame containing cols.

:param model_name: Lowercase Numerai model name.

:param file_name: Path to save the submission file to, relative to the base directory.

:param cols: Columns to be saved in the submission file. 1 prediction column for Numerai Classic. At least 1 prediction column and 1 ticker column for Numerai Signals.

*args, **kwargs are passed to the numerapi API, for example the version argument in Numerai Classic submissions.

Source code in numerblox/submission.py
def full_submission(
    self,
    dataf: pd.DataFrame,
    model_name: str,
    cols: Union[str, list],
    file_name: str = 'submission.csv',
    *args,
    **kwargs,
):
    """
    Save DataFrame to csv and upload predictions through API.

    :param dataf: Main DataFrame containing `cols`.
    :param model_name: Lowercase Numerai model name.
    :param file_name: path to save model to relative to base directory.
    :param cols: Columns to be saved in submission file.
    1 prediction column for Numerai Classic.
    At least 1 prediction column and 1 ticker column for Numerai Signals.
    *args, **kwargs are passed to numerapi API.
    For example `version` argument in Numerai Classic submissions.
    """
    self.save_csv(dataf=dataf, file_name=file_name, cols=cols)
    self.upload_predictions(
        file_name=file_name, model_name=model_name,
        *args, **kwargs
    )

save_csv(dataf, file_name, cols, *args, **kwargs) abstractmethod

For Numerai Classic: Save index column + 'cols' (targets) to CSV. For Numerai Signals: Save ticker, date, data_type and signal columns to CSV.

Source code in numerblox/submission.py
@abstractmethod
def save_csv(
    self,
    dataf: pd.DataFrame,
    file_name: str,
    cols: Union[str, list],
    *args,
    **kwargs,
):
    """
    For Numerai Classic: Save index column + 'cols' (targets) to CSV.
    For Numerai Signals: Save ticker, date, data_type and signal columns to CSV.
    """
    ...

upload_predictions(file_name, model_name, *args, **kwargs)

Upload CSV file to Numerai for given model name.

:param file_name: File name/path relative to directory_path.

:param model_name: Lowercase raw model name (For example, 'integration_test').

Source code in numerblox/submission.py
def upload_predictions(self, file_name: str, model_name: str, *args, **kwargs):
    """
    Upload CSV file to Numerai for given model name.
    :param file_name: File name/path relative to directory_path.
    :param model_name: Lowercase raw model name (For example, 'integration_test').
    """
    full_path = str(self.dir / file_name)
    model_id = self._get_model_id(model_name=model_name)
    api_type = str(self.api.__class__.__name__)
    print(
        f"{api_type}: Uploading predictions from '{full_path}' for model '{model_name}' (model_id='{model_id}')"
    )
    for attempt in range(self.max_retries):
        try:
            self.api.upload_predictions(
                file_path=full_path, model_id=model_id, *args, **kwargs
            )
            print(
                f"{api_type} submission of '{full_path}' for '{model_name}' is successful!"
            )
            return
        except Exception as e:
            if attempt < self.max_retries - 1:  # i.e. not the last attempt
                print(f"Failed to upload '{full_path}' for '{model_name}' to Numerai. Retrying in {self.sleep_time} seconds...")
                print(f"Error: {e}")
                time.sleep(self.sleep_time)
            else:
                if self.fail_silently:
                    print(f"Failed to upload'{full_path}' for '{model_name}' to Numerai. Skipping...")
                    print(f"Error: {e}")
                else:
                    print(f"Failed to upload '{full_path}' for '{model_name}' to Numerai after {self.max_retries} attempts.")
                    raise e

NumerBaySubmitter

Bases: BaseSubmitter

Submit to NumerBay to fulfill sale orders, in addition to submission to Numerai.

:param tournament_submitter: Base tournament submitter (NumeraiClassicSubmitter or NumeraiSignalsSubmitter). This submitter will use the same directory path.

:param upload_to_numerai: Whether to also submit to Numerai using the tournament submitter. Defaults to True; set to False to only upload to NumerBay.

:param numerbay_username: NumerBay username

:param numerbay_password: NumerBay password

Source code in numerblox/submission.py
class NumerBaySubmitter(BaseSubmitter):
    """
    Submit to NumerBay to fulfill sale orders, in addition to submission to Numerai.

    :param tournament_submitter: Base tournament submitter (NumeraiClassicSubmitter or NumeraiSignalsSubmitter). This submitter will use the same directory path.
    :param upload_to_numerai: Whether to also submit to Numerai using the tournament submitter. Defaults to True, set to False to only upload to NumerBay.
    :param numerbay_username: NumerBay username
    :param numerbay_password: NumerBay password
    """
    def __init__(self,
                 tournament_submitter: Union[NumeraiClassicSubmitter, NumeraiSignalsSubmitter],
                 upload_to_numerai: bool = True,
                 numerbay_username: str = None,
                 numerbay_password: str = None):
        super().__init__(
            directory_path=str(tournament_submitter.dir), api=tournament_submitter.api,
            max_retries=tournament_submitter.max_retries, sleep_time=tournament_submitter.sleep_time,
            fail_silently=tournament_submitter.fail_silently
        )
        from numerbay import NumerBay
        self.numerbay_api = NumerBay(username=numerbay_username, password=numerbay_password)
        self.tournament_submitter = tournament_submitter
        self.upload_to_numerai = upload_to_numerai

    def upload_predictions(self,
                           file_name: str,
                           model_name: str,
                           numerbay_product_full_name: str,
                           *args,
                           **kwargs):
        """
        Upload CSV file to NumerBay (and Numerai if 'upload_to_numerai' is True) for given model name and NumerBay product full name.
        :param file_name: File name/path relative to directory_path.
        :param model_name: Lowercase raw model name (For example, 'integration_test').
        :param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'
        """
        if self.upload_to_numerai:
            self.tournament_submitter.upload_predictions(file_name, model_name, *args, **kwargs)

        full_path = str(self.dir / file_name)
        api_type = str(self.numerbay_api.__class__.__name__)
        print(
            f"{api_type}: Uploading predictions from '{full_path}' for NumerBay product '{numerbay_product_full_name}'"
        )
        artifact = self.numerbay_api.upload_artifact(
            str(full_path), product_full_name=numerbay_product_full_name
        )
        if artifact:
            print(
                f"{api_type} submission of '{full_path}' for NumerBay product [bold blue]{numerbay_product_full_name} is successful!"
            )
        else:
            print(f"""WARNING: Upload skipped for NumerBay product '{numerbay_product_full_name}', 
                  the product uses buyer-side encryption but does not have any active sale order to upload for.""")

    def full_submission(
        self,
        dataf: pd.DataFrame,
        model_name: str,
        cols: Union[str, list],
        numerbay_product_full_name: str,
        file_name: str = 'submission.csv',
        *args,
        **kwargs,
    ):
        """
        Save DataFrame to csv and upload predictions through API.

        :param dataf: Main DataFrame containing `cols`.
        :param model_name: Lowercase Numerai model name.
        :param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'
        :param file_name: path to save model to relative to base directory.
        :param cols: Columns to be saved in submission file.
        1 prediction column for Numerai Classic.
        At least 1 prediction column and 1 ticker column for Numerai Signals.
        *args, **kwargs are passed to numerapi API.
        For example `version` argument in Numerai Classic submissions.
        """
        self.save_csv(dataf=dataf, file_name=file_name, cols=cols)
        self.upload_predictions(
            file_name=file_name, model_name=model_name, numerbay_product_full_name=numerbay_product_full_name,
            *args, **kwargs
        )

    def combine_csvs(self, *args,**kwargs) -> pd.DataFrame:
        return self.tournament_submitter.combine_csvs(*args,**kwargs)

    def save_csv(self, *args, **kwargs):
        self.tournament_submitter.save_csv(*args, **kwargs)

    @property
    def get_model_mapping(self) -> dict:
        return self.tournament_submitter.api.get_models()

    def __call__(
            self,
            dataf: pd.DataFrame,
            model_name: str,
            numerbay_product_full_name: str,
            file_name: str = "submission.csv",
            cols: Union[str, list] = "prediction",
            *args,
            **kwargs,
    ):
        """
        The most common use case will be to create a CSV and submit it immediately after that.
        full_submission handles this.
        """
        self.full_submission(
            dataf=dataf,
            file_name=file_name,
            model_name=model_name,
            numerbay_product_full_name=numerbay_product_full_name,
            cols=cols,
            *args,
            **kwargs,
        )

__call__(dataf, model_name, numerbay_product_full_name, file_name='submission.csv', cols='prediction', *args, **kwargs)

The most common use case will be to create a CSV and submit it immediately after that. full_submission handles this.

Source code in numerblox/submission.py
def __call__(
        self,
        dataf: pd.DataFrame,
        model_name: str,
        numerbay_product_full_name: str,
        file_name: str = "submission.csv",
        cols: Union[str, list] = "prediction",
        *args,
        **kwargs,
):
    """
    The most common use case will be to create a CSV and submit it immediately after that.
    full_submission handles this.
    """
    self.full_submission(
        dataf=dataf,
        file_name=file_name,
        model_name=model_name,
        numerbay_product_full_name=numerbay_product_full_name,
        cols=cols,
        *args,
        **kwargs,
    )

full_submission(dataf, model_name, cols, numerbay_product_full_name, file_name='submission.csv', *args, **kwargs)

Save DataFrame to csv and upload predictions through API.

:param dataf: Main DataFrame containing cols.

:param model_name: Lowercase Numerai model name.

:param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'.

:param file_name: Path to save the submission file to, relative to the base directory.

:param cols: Columns to be saved in the submission file. 1 prediction column for Numerai Classic. At least 1 prediction column and 1 ticker column for Numerai Signals.

*args, **kwargs are passed to the numerapi API, for example the version argument in Numerai Classic submissions.

Source code in numerblox/submission.py
def full_submission(
    self,
    dataf: pd.DataFrame,
    model_name: str,
    cols: Union[str, list],
    numerbay_product_full_name: str,
    file_name: str = 'submission.csv',
    *args,
    **kwargs,
):
    """
    Save DataFrame to csv and upload predictions through API.

    :param dataf: Main DataFrame containing `cols`.
    :param model_name: Lowercase Numerai model name.
    :param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'
    :param file_name: path to save model to relative to base directory.
    :param cols: Columns to be saved in submission file.
    1 prediction column for Numerai Classic.
    At least 1 prediction column and 1 ticker column for Numerai Signals.
    *args, **kwargs are passed to numerapi API.
    For example `version` argument in Numerai Classic submissions.
    """
    self.save_csv(dataf=dataf, file_name=file_name, cols=cols)
    self.upload_predictions(
        file_name=file_name, model_name=model_name, numerbay_product_full_name=numerbay_product_full_name,
        *args, **kwargs
    )

upload_predictions(file_name, model_name, numerbay_product_full_name, *args, **kwargs)

Upload CSV file to NumerBay (and Numerai if 'upload_to_numerai' is True) for given model name and NumerBay product full name.

:param file_name: File name/path relative to directory_path.

:param model_name: Lowercase raw model name (For example, 'integration_test').

:param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'.

Source code in numerblox/submission.py
def upload_predictions(self,
                       file_name: str,
                       model_name: str,
                       numerbay_product_full_name: str,
                       *args,
                       **kwargs):
    """
    Upload CSV file to NumerBay (and Numerai if 'upload_to_numerai' is True) for given model name and NumerBay product full name.
    :param file_name: File name/path relative to directory_path.
    :param model_name: Lowercase raw model name (For example, 'integration_test').
    :param numerbay_product_full_name: NumerBay product full name in the format of [category]-[product name], e.g. 'numerai-predictions-numerbay'
    """
    if self.upload_to_numerai:
        self.tournament_submitter.upload_predictions(file_name, model_name, *args, **kwargs)

    full_path = str(self.dir / file_name)
    api_type = str(self.numerbay_api.__class__.__name__)
    print(
        f"{api_type}: Uploading predictions from '{full_path}' for NumerBay product '{numerbay_product_full_name}'"
    )
    artifact = self.numerbay_api.upload_artifact(
        str(full_path), product_full_name=numerbay_product_full_name
    )
    if artifact:
        print(
            f"{api_type} submission of '{full_path}' for NumerBay product [bold blue]{numerbay_product_full_name} is successful!"
        )
    else:
        print(f"""WARNING: Upload skipped for NumerBay product '{numerbay_product_full_name}', 
              the product uses buyer-side encryption but does not have any active sale order to upload for.""")
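
The following is a hedged usage sketch for NumerBaySubmitter wrapping a NumeraiClassicSubmitter (not part of the library source). The Key import path, credentials, model name and NumerBay product name are placeholders.

import pandas as pd
from numerblox.misc import Key  # assumed import path for Key
from numerblox.submission import NumeraiClassicSubmitter, NumerBaySubmitter

key = Key(pub_id="YOUR_PUBLIC_ID", secret_key="YOUR_SECRET_KEY")  # placeholder credentials
classic_submitter = NumeraiClassicSubmitter(directory_path="sub_dir", key=key)

nb_submitter = NumerBaySubmitter(
    tournament_submitter=classic_submitter,
    upload_to_numerai=True,              # also submit to Numerai
    numerbay_username="your_username",   # placeholder NumerBay credentials
    numerbay_password="your_password",
)

# Toy predictions indexed by Numerai 'id' with values in [0, 1].
predictions = pd.DataFrame({"prediction": [0.45, 0.55]}, index=pd.Index(["id_a", "id_b"], name="id"))

nb_submitter.full_submission(
    dataf=predictions,
    model_name="your_model",
    cols="prediction",
    numerbay_product_full_name="numerai-predictions-yourproduct",  # [category]-[product name]
)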

NumeraiClassicSubmitter

Bases: BaseSubmitter

Submit for Numerai Classic.

:param directory_path: Base directory to save and read prediction files from.

:param key: Key object containing valid credentials for Numerai Classic.

:param max_retries: Maximum number of retries for uploading predictions to Numerai.

:param sleep_time: Time to sleep between uploading retries.

:param fail_silently: Whether to skip uploading to Numerai without raising an error. Useful if you are uploading many models in a loop and want to skip models that fail to upload.

*args, **kwargs will be passed to NumerAPI initialization.

Source code in numerblox/submission.py
class NumeraiClassicSubmitter(BaseSubmitter):
    """
    Submit for Numerai Classic.

    :param directory_path: Base directory to save and read prediction files from. \n
    :param key: Key object containing valid credentials for Numerai Classic. \n
    :param max_retries: Maximum number of retries for uploading predictions to Numerai. 
    :param sleep_time: Time to sleep between uploading retries.
    :param fail_silently: Whether to skip uploading to Numerai without raising an error. 
    Useful for if you are uploading many models in a loop and want to skip models that fail to upload.
    *args, **kwargs will be passed to NumerAPI initialization.
    """
    def __init__(self, directory_path: str, key: Key, 
                 max_retries: int = 2, sleep_time: int = 10, 
                 fail_silently=False, *args, **kwargs):
        api = NumerAPI(public_id=key.pub_id, secret_key=key.secret_key, *args, **kwargs)
        super().__init__(
            directory_path=directory_path, api=api,
            max_retries=max_retries, sleep_time=sleep_time, 
            fail_silently=fail_silently
        )

    def save_csv(
            self,
            dataf: pd.DataFrame,
            file_name: str = "submission.csv",
            cols: str = 'prediction',
            *args,
            **kwargs,
    ):
        """
        :param dataf: DataFrame which should have at least the following columns:
        1. id (as index column)
        2. cols (for example, 'prediction_mymodel'). Will be saved in 'prediction' column
        :param file_name: .csv file path.
        :param cols: Prediction column name.
        For example, 'prediction' or 'prediction_mymodel'.
        """
        sub_dataf = deepcopy(dataf)
        self._check_value_range(dataf=sub_dataf, cols=cols)

        full_path = str(self.dir / file_name)
        print(
            f"Saving predictions CSV to '{full_path}'."
        )
        sub_dataf.loc[:, 'prediction'] = sub_dataf[cols]
        sub_dataf.loc[:, 'prediction'].to_csv(full_path, *args, **kwargs)

save_csv(dataf, file_name='submission.csv', cols='prediction', *args, **kwargs)

:param dataf: DataFrame which should have at least the following columns:

1. id (as index column)
2. cols (for example, 'prediction_mymodel'). Will be saved in 'prediction' column

:param file_name: .csv file path.

:param cols: Prediction column name. For example, 'prediction' or 'prediction_mymodel'.

Source code in numerblox/submission.py
def save_csv(
        self,
        dataf: pd.DataFrame,
        file_name: str = "submission.csv",
        cols: str = 'prediction',
        *args,
        **kwargs,
):
    """
    :param dataf: DataFrame which should have at least the following columns:
    1. id (as index column)
    2. cols (for example, 'prediction_mymodel'). Will be saved in 'prediction' column
    :param file_name: .csv file path.
    :param cols: Prediction column name.
    For example, 'prediction' or 'prediction_mymodel'.
    """
    sub_dataf = deepcopy(dataf)
    self._check_value_range(dataf=sub_dataf, cols=cols)

    full_path = str(self.dir / file_name)
    print(
        f"Saving predictions CSV to '{full_path}'."
    )
    sub_dataf.loc[:, 'prediction'] = sub_dataf[cols]
    sub_dataf.loc[:, 'prediction'].to_csv(full_path, *args, **kwargs)
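
A minimal end-to-end sketch for NumeraiClassicSubmitter; calling the submitter saves the CSV and uploads it in one step. The Key import path, credentials, directory, model name and the toy DataFrame are placeholders.

import pandas as pd
from numerblox.misc import Key  # assumed import path for Key
from numerblox.submission import NumeraiClassicSubmitter

key = Key(pub_id="YOUR_PUBLIC_ID", secret_key="YOUR_SECRET_KEY")  # placeholder credentials
submitter = NumeraiClassicSubmitter(directory_path="sub_dir", key=key)

# Toy DataFrame indexed by Numerai 'id' with predictions in the required (0, 1) range.
predictions = pd.DataFrame({"prediction": [0.45, 0.55]}, index=pd.Index(["id_a", "id_b"], name="id"))

# Equivalent to full_submission: saves 'submission.csv' in sub_dir and uploads it.
submitter(dataf=predictions, model_name="your_model", cols="prediction")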

NumeraiSignalsSubmitter

Bases: BaseSubmitter

Submit for Numerai Signals.

:param directory_path: Base directory to save and read prediction files from.

:param key: Key object containing valid credentials for Numerai Signals.

:param max_retries: Maximum number of retries for uploading predictions to Numerai.

:param sleep_time: Time to sleep between uploading retries.

:param fail_silently: Whether to skip uploading to Numerai without raising an error. Useful if you are uploading many models in a loop and want to skip models that fail to upload.

*args, **kwargs will be passed to SignalsAPI initialization.

Source code in numerblox/submission.py
class NumeraiSignalsSubmitter(BaseSubmitter):
    """
    Submit for Numerai Signals.

    :param directory_path: Base directory to save and read prediction files from. \n
    :param key: Key object containing valid credentials for Numerai Signals. \n
    :param max_retries: Maximum number of retries for uploading predictions to Numerai. 
    :param sleep_time: Time to sleep between uploading retries.
    :param fail_silently: Whether to skip uploading to Numerai without raising an error. 
    Useful for if you are uploading many models in a loop and want to skip models that fail to upload.
    *args, **kwargs will be passed to SignalsAPI initialization.
    """
    def __init__(self, directory_path: str, key: Key, 
                 max_retries: int = 2, sleep_time: int = 10, 
                 fail_silently=False, *args, **kwargs):
        api = SignalsAPI(
            public_id=key.pub_id, secret_key=key.secret_key, *args, **kwargs
        )
        super().__init__(
            directory_path=directory_path, api=api,
            max_retries=max_retries, sleep_time=sleep_time,
            fail_silently=fail_silently
        )
        self.supported_ticker_formats = [
            "cusip",
            "sedol",
            "ticker",
            "numerai_ticker",
            "bloomberg_ticker",
        ]

    def save_csv(
            self,
            dataf: pd.DataFrame,
            cols: list,
            file_name: str = "submission.csv",
            *args, **kwargs
    ):
        """
        :param dataf: DataFrame which should have at least the following columns:
         1. One of supported ticker formats (cusip, sedol, ticker, numerai_ticker or bloomberg_ticker)
         2. signal (Values between 0 and 1 (exclusive))
         Additional columns for if you include validation data (optional):
         3. date (YYYY-MM-DD format date indication)
         4. data_type ('val' and 'live' partitions)

         :param cols: All cols that are saved in CSV.
         cols should contain at least 1 ticker column and a 'signal' column.
         For example: ['bloomberg_ticker', 'signal']
         :param file_name: .csv file path.
        """
        self._check_ticker_format(cols=cols)
        self._check_value_range(dataf=dataf, cols="signal")

        full_path = str(self.dir / file_name)
        print(
            f"Saving Signals predictions CSV to '{full_path}'."
        )
        dataf.loc[:, cols].reset_index(drop=True).to_csv(
            full_path, index=False, *args, **kwargs
        )

    def _check_ticker_format(self, cols: list):
        """ Check for valid ticker format. """
        valid_tickers = set(cols).intersection(set(self.supported_ticker_formats))
        if not valid_tickers:
            raise NotImplementedError(
                f"No supported ticker format in {cols}). \
Supported: '{self.supported_ticker_formats}'"
            )

save_csv(dataf, cols, file_name='submission.csv', *args, **kwargs)

:param dataf: DataFrame which should have at least the following columns:

1. One of the supported ticker formats (cusip, sedol, ticker, numerai_ticker or bloomberg_ticker)
2. signal (values between 0 and 1, exclusive)

Additional columns if you include validation data (optional):

3. date (YYYY-MM-DD format date indication)
4. data_type ('val' and 'live' partitions)

:param cols: All cols that are saved in CSV. cols should contain at least 1 ticker column and a 'signal' column. For example: ['bloomberg_ticker', 'signal']

:param file_name: .csv file path.

Source code in numerblox/submission.py
def save_csv(
        self,
        dataf: pd.DataFrame,
        cols: list,
        file_name: str = "submission.csv",
        *args, **kwargs
):
    """
    :param dataf: DataFrame which should have at least the following columns:
     1. One of supported ticker formats (cusip, sedol, ticker, numerai_ticker or bloomberg_ticker)
     2. signal (Values between 0 and 1 (exclusive))
     Additional columns for if you include validation data (optional):
     3. date (YYYY-MM-DD format date indication)
     4. data_type ('val' and 'live' partitions)

     :param cols: All cols that are saved in CSV.
     cols should contain at least 1 ticker column and a 'signal' column.
     For example: ['bloomberg_ticker', 'signal']
     :param file_name: .csv file path.
    """
    self._check_ticker_format(cols=cols)
    self._check_value_range(dataf=dataf, cols="signal")

    full_path = str(self.dir / file_name)
    print(
        f"Saving Signals predictions CSV to '{full_path}'."
    )
    dataf.loc[:, cols].reset_index(drop=True).to_csv(
        full_path, index=False, *args, **kwargs
    )