Spaces:

agagliano
/

relaiss_search

Sleeping

App Files Files Community

alexandergagliano committed on May 13

Commit

e41deef

1 Parent(s): 271a1e6

restructure and prepare for pypi release

Browse files

Files changed (10) hide show

LICENSE +21 -0
README.md +3 -1
notebooks/.ipynb_checkpoints/01_relaiss_basics-checkpoint.ipynb +0 -0
pyproject.toml +124 -0
{code → src/relaiss}/constants.py +0 -0
{code → src/relaiss}/helper_func.py +293 -16
{code → src/relaiss}/lightcurve_engineer.py +128 -2
{code → src/relaiss}/relaiss_func.py +179 -4
static/reLAISS_logo.png +3 -0
tests/test_search.py +30 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,4 +1,6 @@
-# reLAISS
 _A flexible library for similarity searches of supernovae and their host galaxies._

+<p align="center">
+  <img src="https://github.com/evan-reynolds/re-laiss/blob/main/static/reLAISS_logo.png?raw=true" style="width: 50%;" alt="reLAISS Logo">
+</p>
 _A flexible library for similarity searches of supernovae and their host galaxies._

notebooks/.ipynb_checkpoints/01_relaiss_basics-checkpoint.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,124 @@

+[project]
+name = "relaiss"
+license = {file = "LICENSE"}
+readme = "README.md"
+authors = [
+    { name = "Evan Reynolds", email = ""},
+    { name = "Alex Gagliano", email = "[email protected]" },
+    { name = "Ashley Villar", email = "[email protected]"}
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: MIT License",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+]
+dynamic = ["version"]
+requires-python = ">=3.8"
+dependencies = [
+  "numpy",
+  "astropy",
+  "matplotlib",
+  "pandas",
+  "scikit-learn",
+  "scipy",
+  "requests",
+  "sfdmap; python_version < '3.9'",
+  "sfdmap2; python_version >= '3.9'",
+]
+[project.urls]
+"Source Code" = "https://github.com/evan-reynolds/re-laiss/"
+[project.optional-dependencies]
+dev = [
+    "asv==0.6.4", # Used to compute performance benchmarks
+    "jupyter", # Clears output from Jupyter notebooks
+    "pre-commit", # Used to run checks before finalizing a git commit
+    "pytest",
+    "pytest-cov", # Used to report total code coverage
+    "ruff", # Used for static linting of files
+]
+[build-system]
+requires = [
+    "setuptools>=62", # Used to build and package the Python project
+    "setuptools_scm>=6.2", # Gets release version from git. Makes it available programmatically
+]
+build-backend = "setuptools.build_meta"
+[tool.setuptools]
+include-package-data = true
+[tool.setuptools.package-data]
+"relaiss" = ["data/*csv*"]
+[tool.setuptools_scm]
+write_to = "src/relaiss/_version.py"
+local_scheme = "no-local-version"
+version_scheme = "no-guess-dev"
+[tool.pytest.ini_options]
+testpaths = [
+    "tests",
+]
+[tool.black]
+line-length = 110
+target-version = ["py39"]
+[tool.isort]
+profile = "black"
+line_length = 110
+[tool.ruff]
+line-length = 110
+target-version = "py39"
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    "W",
+    # Pyflakes
+    "F",
+    # pep8-naming
+    "N",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # docstrings
+    "D101",
+    "D102",
+    "D103",
+    "D106",
+    "D206",
+    "D207",
+    "D208",
+    "D300",
+    "D417",
+    "D419",
+    # Numpy v2.0 compatibility
+    "NPY201",
+]
+ignore = [
+    "UP006", # Allow non standard library generics in type hints
+    "UP007", # Allow Union in type hints
+    "SIM114", # Allow if with same arms
+    "B028", # Allow default warning level
+    "SIM117", # Allow nested with
+    "UP015", # Allow redundant open parameters
+    "UP028", # Allow yield in for loop
+]
+[tool.coverage.run]
+omit=["src/relaiss/_version.py"]

{code → src/relaiss}/constants.py RENAMED Viewed

File without changes

{code → src/relaiss}/helper_func.py RENAMED Viewed

@@ -28,10 +28,24 @@ from scipy.stats import gamma, uniform
 from dust_extinction.parameter_averages import G23
 from astro_prost.associate import associate_sample
 @contextmanager
 def re_suppress_output():
-    """Temporarily silence stdout, stderr, warnings *and* all logging messages < CRITICAL."""
     with open(os.devnull, "w") as devnull:
         old_stdout, old_stderr = sys.stdout, sys.stderr
         sys.stdout, sys.stderr = devnull, devnull
@@ -47,6 +61,19 @@ def re_suppress_output():
 def re_getTnsData(ztf_id):
     locus = antares_client.search.get_by_ztf_object_id(ztf_object_id=ztf_id)
     try:
         tns = locus.catalog_objects["tns_public_objects"][0]
@@ -64,6 +91,25 @@ def re_getExtinctionCorrectedMag(
     av_in_raw_df_bank,
     path_to_sfd_folder=None,
 ):
     central_wv_filters = {"g": 4849.11, "r": 6201.20, "i": 7534.96, "z": 8674.20}
     MW_RV = 3.1
     ext = G23(Rv=MW_RV)
@@ -90,7 +136,36 @@ def re_build_dataset_bank(
     building_entire_df_bank=False,
     building_for_AD=False,
 ):
     raw_lc_features = constants.lc_features_const.copy()
     raw_host_features = constants.raw_host_features_const.copy()
@@ -232,6 +307,41 @@ def re_extract_lc_and_host_features(
     building_for_AD=False,
     swapped_host=False,
 ):
     start_time = time.time()
     df_path = path_to_timeseries_folder
@@ -492,8 +602,19 @@ def re_extract_lc_and_host_features(
 def _ps1_list_filenames(ra_deg, dec_deg, flt):
-    """
-    Return the first stack FITS filename for (ra,dec) and *flt* or None.
     """
     url = (
         "https://ps1images.stsci.edu/cgi-bin/ps1filenames.py"
@@ -509,8 +630,26 @@ def _ps1_list_filenames(ra_deg, dec_deg, flt):
 def fetch_ps1_cutout(ra_deg, dec_deg, *, size_pix=100, flt="r"):
-    """
-    Grayscale cut-out (2-D float) in a single PS1 filter.
     """
     fits_name = _ps1_list_filenames(ra_deg, dec_deg, flt)
     if fits_name is None:
@@ -537,9 +676,21 @@ def fetch_ps1_cutout(ra_deg, dec_deg, *, size_pix=100, flt="r"):
 def fetch_ps1_rgb_jpeg(ra_deg, dec_deg, *, size_pix=100):
-    """
-    Colour JPEG (H,W,3  uint8) using PS1 g/r/i stacks.
-    Falls back by *raising* RuntimeError when the server lacks colour data.
     """
     url = (
         "https://ps1images.stsci.edu/cgi-bin/fitscut.cgi"
@@ -567,6 +718,33 @@ def re_plot_lightcurves(
     figure_path,
     save_figures=True,
 ):
     print("Making a plot of stacked lightcurves...")
     if primer_dict["lc_tns_z"] is None:
@@ -737,12 +915,35 @@ def re_plot_hosts(
     change_contrast=False,
     prefer_color=True,
 ):
-    """
-    Build 3×3 grids of PS1 thumbnails for each row in *df* and write a PDF.
-    Set *prefer_color=False* for r-band grayscale only.  With *prefer_color=True*
-    (default) the code *tries* colour first and quietly falls back to grayscale
-    when colour isn’t available.
     """
     host_grid_path = figure_path + "/host_grids"
@@ -833,6 +1034,40 @@ def re_check_anom_and_plot(
     savefig,
     figure_path,
 ):
     anom_obj_df = timeseries_df_features_only
     pred_prob_anom = 100 * clf.predict_proba(anom_obj_df)
@@ -1000,6 +1235,30 @@ def re_get_timeseries_df(
     building_for_AD=False,
     swapped_host=False,
 ):
     if theorized_lightcurve_df is not None:
         print("Extracting full lightcurve features for theorized lightcurve...")
         timeseries_df = re_extract_lc_and_host_features(
@@ -1048,6 +1307,24 @@ def re_get_timeseries_df(
 def create_re_laiss_features_dict(
     lc_feature_names, host_feature_names, lc_groups=4, host_groups=4
 ):
     re_laiss_features_dict = {}
     # Split light curve features into evenly sized chunks

 from dust_extinction.parameter_averages import G23
 from astro_prost.associate import associate_sample
 @contextmanager
 def re_suppress_output():
+    """Context-manager that silences *everything* except CRITICAL logs.
+    Temporarily redirects ``stdout``/``stderr`` to ``os.devnull``, ignores
+    warnings, and disables the root logger for messages < ``logging.CRITICAL``.
+    Restores all streams and the logger state on exit.
+    Yields
+    ------
+    None
+        Used only for the ``with`` context block.
+    Examples
+    --------
+    >>> with re_suppress_output():
+    ...     noisy_function()
+    """
     with open(os.devnull, "w") as devnull:
         old_stdout, old_stderr = sys.stdout, sys.stderr
         sys.stdout, sys.stderr = devnull, devnull
 def re_getTnsData(ztf_id):
+    """Fetch the TNS cross-match for a given ZTF object.
+    Parameters
+    ----------
+    ztf_id : str
+        ZTF object ID, e.g. ``"ZTF23abcxyz"``.
+    Returns
+    -------
+    tuple[str, str, float]
+        *(tns_name, tns_type, tns_redshift)*.  Values default to
+        ``("No TNS", "---", -99)`` when no match or metadata are present.
+    """
     locus = antares_client.search.get_by_ztf_object_id(ztf_object_id=ztf_id)
     try:
         tns = locus.catalog_objects["tns_public_objects"][0]
     av_in_raw_df_bank,
     path_to_sfd_folder=None,
 ):
+    """Milky-Way extinction-corrected Kron magnitude for one passband.
+    Parameters
+    ----------
+    transient_row : pandas.Series
+        Row from the raw host-feature DataFrame.
+    band : {'g', 'r', 'i', 'z'}
+        Photometric filter to correct.
+    av_in_raw_df_bank : bool
+        If *True* use ``transient_row["A_V"]`` directly; otherwise compute
+        E(B−V) from the SFD dust map in *path_to_sfd_folder*.
+    path_to_sfd_folder : str | pathlib.Path | None, optional
+        Folder containing *SFDMap* dust files when A_V is not pre-computed.
+    Returns
+    -------
+    float
+        Extinction-corrected Kron magnitude.
+    """
     central_wv_filters = {"g": 4849.11, "r": 6201.20, "i": 7534.96, "z": 8674.20}
     MW_RV = 3.1
     ext = G23(Rv=MW_RV)
     building_entire_df_bank=False,
     building_for_AD=False,
 ):
+    """Clean, impute, dust-correct, and engineer features for reLAISS.
+    Handles both archival and *theorized* light-curve inputs, performs KNN or
+    mean imputation, builds colour indices, propagates uncertainties, and
+    returns a ready-to-index DataFrame.
+    Parameters
+    ----------
+    raw_df_bank : pandas.DataFrame
+        Input light-curve + host-galaxy features (one or many rows).
+    av_in_raw_df_bank : bool
+        Whether A_V is already present in *raw_df_bank*.
+    path_to_sfd_folder : str | Path | None, optional
+        Directory with SFD dust maps (required if ``av_in_raw_df_bank=False``).
+    theorized : bool, default False
+        Set *True* when the input is a simulated/theoretical light curve that
+        lacks host features.
+    path_to_dataset_bank : str | Path | None, optional
+        Existing bank used to fit the imputer when not building the entire set.
+    building_entire_df_bank : bool, default False
+        If *True*, fit the imputer on *raw_df_bank* itself.
+    building_for_AD : bool, default False
+        Use simpler mean imputation and suppress verbose prints for
+        anomaly-detection pipelines.
+    Returns
+    -------
+    pandas.DataFrame
+        Fully hydrated feature table indexed by ``ztf_object_id``.
+    """
     raw_lc_features = constants.lc_features_const.copy()
     raw_host_features = constants.raw_host_features_const.copy()
     building_for_AD=False,
     swapped_host=False,
 ):
+    """End-to-end extraction of light-curve **and** host-galaxy features.
+    1. Pulls ZTF photometry from ANTARES (or uses a supplied theoretical LC).
+    2. Computes time-series features with *lightcurve_engineer*.
+    3. Associates the most probable PS1 host with PROST and appends raw host
+       features.
+    4. Dust-corrects, builds colours, imputes gaps, and writes an optional CSV.
+    Parameters
+    ----------
+    ztf_id : str
+        ZTF object identifier (ignored when *theorized_lightcurve_df* is given).
+    path_to_timeseries_folder : str | Path
+        Folder to cache per-object time-series CSVs.
+    path_to_sfd_data_folder : str | Path
+        Location of SFD dust maps.
+    theorized_lightcurve_df : pandas.DataFrame | None, optional
+        Pre-simulated LC in ANTARES column format (``ant_passband``, ``ant_mjd``,
+        ``ant_mag``, ``ant_magerr``).
+    show_lc : bool, default False
+        Plot the g/r light curves.
+    show_host : bool, default True
+        Print PS1 cut-out URL on successful host association.
+    store_csv : bool, default False
+        Write a timeseries CSV next to *path_to_timeseries_folder*.
+    building_for_AD : bool, default False
+        Quieter prints + mean imputation only.
+    swapped_host : bool, default False
+        Indicator used when re-running with an alternate host galaxy.
+    Returns
+    -------
+    pandas.DataFrame
+        Hydrated feature rows for every increasing-epoch subset of the LC.
+    """
     start_time = time.time()
     df_path = path_to_timeseries_folder
 def _ps1_list_filenames(ra_deg, dec_deg, flt):
+    """Return the first PS1 stacked-image FITS filename at (RA, Dec).
+    Parameters
+    ----------
+    ra_deg, dec_deg : float
+        ICRS coordinates in degrees.
+    flt : str
+        PS1 filter letter (``'g' 'r' 'i' 'z' 'y'``).
+    Returns
+    -------
+    str | None
+        Filename, e.g. ``'tess-skycell1001.012-i.fits'``, or *None* when absent.
     """
     url = (
         "https://ps1images.stsci.edu/cgi-bin/ps1filenames.py"
 def fetch_ps1_cutout(ra_deg, dec_deg, *, size_pix=100, flt="r"):
+    """Download a single-filter PS1 FITS cut-out around *(RA, Dec)*.
+    Parameters
+    ----------
+    ra_deg, dec_deg : float
+        ICRS coordinates (degrees).
+    size_pix : int, default 100
+        Width/height of the square cut-out in PS1 pixels.
+    flt : str, default 'r'
+        PS1 filter.
+    Returns
+    -------
+    numpy.ndarray
+        2-D float array (grayscale image).
+    Raises
+    ------
+    RuntimeError
+        When the target lies outside the PS1 footprint or no data exist.
     """
     fits_name = _ps1_list_filenames(ra_deg, dec_deg, flt)
     if fits_name is None:
 def fetch_ps1_rgb_jpeg(ra_deg, dec_deg, *, size_pix=100):
+    """Fetch an RGB JPEG cut-out (g/r/i) from PS1.
+    Raises ``RuntimeError`` when PS1 lacks colour data so the caller can fall back.
+    Parameters
+    ----------
+    ra_deg, dec_deg : float
+        ICRS coordinates (degrees).
+    size_pix : int, default 100
+        Square cut-out size in pixels.
+    Returns
+    -------
+    numpy.ndarray
+        ``(H, W, 3)`` uint8 array in RGB order.
     """
     url = (
         "https://ps1images.stsci.edu/cgi-bin/fitscut.cgi"
     figure_path,
     save_figures=True,
 ):
+    """Stack reference + neighbour light curves in a single figure.
+    Parameters
+    ----------
+    primer_dict : dict
+        Metadata for the reference transient (e.g., TNS name/class/redshift).
+    plot_label : str
+        Text used for figure title and filename.
+    theorized_lightcurve_df : pandas.DataFrame | None
+        Optional simulated LC to plot as the reference.
+    neighbor_ztfids : list[str]
+        ZTF IDs of retrieved neighbours (<= 8 plotted).
+    ann_locus_l : list[antares_client.objects.Locus]
+        Corresponding ANTARES loci holding photometry.
+    ann_dists : list[float]
+        ANN distances for labeling.
+    tns_ann_names, tns_ann_classes, tns_ann_zs : list
+        TNS metadata for neighbours.
+    figure_path : str | Path
+        Root folder to save PNGs in ``lightcurves/``.
+    save_figures : bool, default True
+        Write the PNG to disk.
+    Returns
+    -------
+    None
+    """
     print("Making a plot of stacked lightcurves...")
     if primer_dict["lc_tns_z"] is None:
     change_contrast=False,
     prefer_color=True,
 ):
+    """Create 3×3 PS1 thumbnail grids for candidate host galaxies.
+    Saves each page to a multi-page PDF and optionally shows colour cut-outs
+    when available.
+    Parameters
+    ----------
+    ztfid_ref : str
+        Reference transient ID (title use only).
+    plot_label : str
+        Basename for the output PDF.
+    df : pandas.DataFrame
+        Table with ``ZTFID``, ``HOST_RA``, ``HOST_DEC`` columns.
+    figure_path : str | Path
+        Destination directory for ``host_grids/*.pdf``.
+    ann_num : int
+        ANN neighbour index (used in filename).
+    save_pdf : bool, default True
+        Whether to write the PDF.
+    imsizepix : int, default 100
+        PS1 cut-out size in pixels.
+    change_contrast : bool, default False
+        Use a shallower stretch (93 %) for grayscale images.
+    prefer_color : bool, default True
+        Try RGB first, fall back to r-band grayscale.
+    Returns
+    -------
+    None
     """
     host_grid_path = figure_path + "/host_grids"
     savefig,
     figure_path,
 ):
+    """Run anomaly-detector probabilities over a time-series and plot results.
+    Produces a two-panel figure: light curve with anomaly epoch marked, and
+    rolling anomaly/normal probabilities.
+    Parameters
+    ----------
+    clf : sklearn.base.ClassifierMixin
+        Trained binary classifier with ``predict_proba``.
+    input_ztf_id : str
+        ID of the object evaluated.
+    swapped_host_ztf_id : str | None
+        Alternate host ID (annotated in title).
+    input_spec_cls : str | None
+        Spectroscopic class label for title.
+    input_spec_z : float | str | None
+        Redshift for title.
+    anom_thresh : float
+        Probability (%) above which an epoch is flagged anomalous.
+    timeseries_df_full : pandas.DataFrame
+        Hydrated LC + host features, including ``obs_num`` and ``mjd_cutoff``.
+    timeseries_df_features_only : pandas.DataFrame
+        Same rows but feature columns only (classifier input).
+    ref_info : antares_client.objects.Locus
+        ANTARES locus for retrieving original photometry.
+    savefig : bool
+        Save the plot as ``AD/*.pdf`` inside *figure_path*.
+    figure_path : str | Path
+        Output directory.
+    Returns
+    -------
+    None
+    """
     anom_obj_df = timeseries_df_features_only
     pred_prob_anom = 100 * clf.predict_proba(anom_obj_df)
     building_for_AD=False,
     swapped_host=False,
 ):
+    """Retrieve or build a fully-hydrated time-series feature DataFrame.
+    Checks disk cache; otherwise calls
+    ``re_extract_lc_and_host_features`` and optionally writes the CSV.
+    Parameters
+    ----------
+    ztf_id : str
+    path_to_timeseries_folder : str | Path
+    path_to_sfd_data_folder : str | Path
+    theorized_lightcurve_df : pandas.DataFrame | None
+        If provided, builds features for a simulated LC.
+    save_timeseries : bool, default False
+        Persist CSV to disk.
+    path_to_dataset_bank : str | Path | None
+        Reference bank for imputers.
+    building_for_AD : bool, default False
+    swapped_host : bool, default False
+    Returns
+    -------
+    pandas.DataFrame
+        Feature rows ready for indexing or AD.
+    """
     if theorized_lightcurve_df is not None:
         print("Extracting full lightcurve features for theorized lightcurve...")
         timeseries_df = re_extract_lc_and_host_features(
 def create_re_laiss_features_dict(
     lc_feature_names, host_feature_names, lc_groups=4, host_groups=4
 ):
+    """Partition feature names into evenly-sized groups for weighting.
+    Parameters
+    ----------
+    lc_feature_names : list[str]
+        Names of light-curve features.
+    host_feature_names : list[str]
+        Names of host-galaxy features.
+    lc_groups : int, default 4
+        Number of LC groups in the output dict.
+    host_groups : int, default 4
+        Number of host groups in the output dict.
+    Returns
+    -------
+    dict[str, list[str]]
+        ``{'lc_group_1': [...], 'host_group_1': [...], ...}``
+    """
     re_laiss_features_dict = {}
     # Split light curve features into evenly sized chunks

{code → src/relaiss}/lightcurve_engineer.py RENAMED Viewed

@@ -6,11 +6,25 @@ from sfdmap2 import sfdmap
 from dust_extinction.parameter_averages import G23
 from numpy.lib.stride_tricks import sliding_window_view
 import warnings
 warnings.filterwarnings("ignore", category=RuntimeWarning)
 def local_curvature(times, mags):
     if len(times) < 3:
         return np.nan
     curvatures = []
@@ -31,6 +45,14 @@ m = sfdmap.SFDMap()
 class SupernovaFeatureExtractor:
     @staticmethod
     def describe_features():
         return {
             "t0": "Time zero-point for light curve normalization",
             "g_peak_mag": "Minimum magnitude (brightest point) in g band",
@@ -77,6 +99,22 @@ class SupernovaFeatureExtractor:
     def __init__(
         self, time_g, mag_g, err_g, time_r, mag_r, err_r, ZTFID=None, ra=None, dec=None
     ):
         if ZTFID:
             self.ZTFID = ZTFID
         else:
@@ -107,6 +145,12 @@ class SupernovaFeatureExtractor:
         self._preprocess()
     def _preprocess(self, min_cluster_size=2):
         for band_name in ["g", "r"]:
             band = getattr(self, band_name)
             idx = np.argsort(band["time"])
@@ -146,8 +190,10 @@ class SupernovaFeatureExtractor:
             self.time_offset += new_time_offset
     def _select_main_cluster(self, time, mag, min_samples=3, eps=20):
-        from sklearn.cluster import DBSCAN
         if len(time) < min_samples:
             return np.ones_like(time, dtype=bool)
         time_reshaped = np.array(time).reshape(-1, 1)
@@ -171,6 +217,13 @@ class SupernovaFeatureExtractor:
         return labels == best_label
     def _flag_isolated_points(time, max_gap_factor=5):
         time = np.sort(time)
         dt = np.diff(time)
@@ -188,6 +241,22 @@ class SupernovaFeatureExtractor:
         return isolated
     def _core_stats(self, band):
         t, m = band["time"], band["mag"]
         mask = np.isfinite(t) & np.isfinite(m) & ~np.isnan(m)
         t, m = t[mask], m[mask]
@@ -216,6 +285,13 @@ class SupernovaFeatureExtractor:
         return peak_mag, peak_time, rise_time, decline_time, duration
     def _variability_stats(self, band):
         mag = band["mag"]
         amp = np.max(mag) - np.min(mag)
         std = np.std(mag)
@@ -225,6 +301,14 @@ class SupernovaFeatureExtractor:
         return amp, skew, beyond_2
     def _color_features(self):
         if len(self.g["time"]) < 2 or len(self.r["time"]) < 2:
             # print("Warning: Not enough data in g or r band to compute color features.")
             return None
@@ -261,6 +345,18 @@ class SupernovaFeatureExtractor:
         return np.mean(color), gr_at_gpeak, mean_rate
     def _rolling_variance(self, band, window_size=5):
         def dedup(t, m):
             _, idx = np.unique(t, return_index=True)
             return t[idx], m[idx]
@@ -275,6 +371,13 @@ class SupernovaFeatureExtractor:
         return np.max(rolling_vars), np.mean(rolling_vars)
     def _peak_structure(self, band):
         if np.ptp(band["mag"]) < 0.5:
             # print("Warning: Insufficient variability to identify peak structure.")
             return 0, np.nan, np.nan, np.nan, np.nan
@@ -299,6 +402,13 @@ class SupernovaFeatureExtractor:
         return n_peaks, dt, dmag, prominence_second, width_second
     def _local_curvature_features(self, band, window_days=20):
         t, m = band["time"], band["mag"]
         mask = np.isfinite(t) & np.isfinite(m)
         t, m = t[mask], m[mask]
@@ -330,6 +440,22 @@ class SupernovaFeatureExtractor:
         return rise_curv, decline_curv
     def extract_features(self, return_uncertainty=False, n_trials=20):
         if len(self.g["time"]) == 0 or len(self.r["time"]) == 0:
             # print(
             #     f"Warning: No data left in g or r band after filtering for object: {self.ZTFID}. Skipping."

 from dust_extinction.parameter_averages import G23
 from numpy.lib.stride_tricks import sliding_window_view
 import warnings
+from sklearn.cluster import DBSCAN
 warnings.filterwarnings("ignore", category=RuntimeWarning)
 def local_curvature(times, mags):
+    """Median second derivative (curvature) of a light-curve segment.
+    Parameters
+    ----------
+    times : array-like
+        Strictly increasing observation times (days).
+    mags : array-like
+        Corresponding magnitudes.
+    Returns
+    -------
+    float
+        Median curvature in mag day⁻²; ``np.nan`` if fewer than three points.
+    """
     if len(times) < 3:
         return np.nan
     curvatures = []
 class SupernovaFeatureExtractor:
     @staticmethod
     def describe_features():
+        """Dictionary mapping feature names → human-readable descriptions.
+        Returns
+        -------
+        dict[str, str]
+            Keys follow the column names produced by
+            :py:meth:`SupernovaFeatureExtractor.extract_features`.
+        """
         return {
             "t0": "Time zero-point for light curve normalization",
             "g_peak_mag": "Minimum magnitude (brightest point) in g band",
     def __init__(
         self, time_g, mag_g, err_g, time_r, mag_r, err_r, ZTFID=None, ra=None, dec=None
     ):
+        """Create a feature extractor for g/r light curves.
+        Times are zero-pointed to the earliest observation; optional Milky-Way
+        extinction is applied when *ra/dec* are supplied.
+        Parameters
+        ----------
+        time_g, mag_g, err_g : array-like
+            g-band MJD, magnitude and 1-σ uncertainty.
+        time_r, mag_r, err_r : array-like
+            r-band MJD, magnitude and 1-σ uncertainty.
+        ZTFID : str | None, optional
+            Identifier used in warnings and output tables.
+        ra, dec : float | None, optional
+            ICRS coordinates (deg) for dust-extinction correction.
+        """
         if ZTFID:
             self.ZTFID = ZTFID
         else:
         self._preprocess()
     def _preprocess(self, min_cluster_size=2):
+        """Sort, de-duplicate, and DBSCAN-filter out isolated epochs.
+        Removes cluster labels with fewer than *min_cluster_size* points and
+        re-normalises times so that ``t=0`` corresponds to the earliest good
+        observation in either band.
+        """
         for band_name in ["g", "r"]:
             band = getattr(self, band_name)
             idx = np.argsort(band["time"])
             self.time_offset += new_time_offset
     def _select_main_cluster(self, time, mag, min_samples=3, eps=20):
+        """Return a boolean mask selecting the dominant DBSCAN time cluster.
+        The cluster with the brightest peak and tightest span wins the tie-break.
+        """
         if len(time) < min_samples:
             return np.ones_like(time, dtype=bool)
         time_reshaped = np.array(time).reshape(-1, 1)
         return labels == best_label
     def _flag_isolated_points(time, max_gap_factor=5):
+        """Identify photometric points that are isolated by large temporal gaps.
+        Returns
+        -------
+        numpy.ndarray[bool]
+            True for epochs flanked by gaps > *max_gap_factor* × median cadence.
+        """
         time = np.sort(time)
         dt = np.diff(time)
         return isolated
     def _core_stats(self, band):
+        """Peak, rise/decline and half-flux duration for one band.
+        Parameters
+        ----------
+        band : dict
+            ``{'time','mag'}`` arrays for a single filter.
+        Returns
+        -------
+        tuple
+            *(peak_mag, peak_time, rise_time, decline_time, duration_above_half)*
+        Notes
+        -----
+        All values are ``np.nan`` if <3 points or total peak-to-peak amplitude <0.2 mag.
+        """
         t, m = band["time"], band["mag"]
         mask = np.isfinite(t) & np.isfinite(m) & ~np.isnan(m)
         t, m = t[mask], m[mask]
         return peak_mag, peak_time, rise_time, decline_time, duration
     def _variability_stats(self, band):
+        """Amplitude, skewness, and 2-σ outlier rate of a magnitude series.
+        Returns
+        -------
+        tuple
+            *(amplitude, skewness, fraction_beyond_2σ)*
+        """
         mag = band["mag"]
         amp = np.max(mag) - np.min(mag)
         std = np.std(mag)
         return amp, skew, beyond_2
     def _color_features(self):
+        """Compute mean g–r colour, g–r at g-band peak, and average colour slope.
+        Returns
+        -------
+        tuple
+            ``(mean_colour, colour_at_g_peak, mean_dcolour_dt)``
+            or ``None`` when bands lack overlap.
+        """
         if len(self.g["time"]) < 2 or len(self.r["time"]) < 2:
             # print("Warning: Not enough data in g or r band to compute color features.")
             return None
         return np.mean(color), gr_at_gpeak, mean_rate
     def _rolling_variance(self, band, window_size=5):
+        """Max & mean variance in sliding windows over an interpolated LC.
+        Parameters
+        ----------
+        window_size : int, default 5
+            Number of interpolated samples per window.
+        Returns
+        -------
+        tuple
+            *(max_var, mean_var)*
+        """
         def dedup(t, m):
             _, idx = np.unique(t, return_index=True)
             return t[idx], m[idx]
         return np.max(rolling_vars), np.mean(rolling_vars)
     def _peak_structure(self, band):
+        """Secondary-peak diagnostics using SciPy ``find_peaks``.
+        Returns
+        -------
+        tuple
+            *(n_peaks, Δt, Δmag, prominence₂, width₂)* with NaNs when <2 peaks.
+        """
         if np.ptp(band["mag"]) < 0.5:
             # print("Warning: Insufficient variability to identify peak structure.")
             return 0, np.nan, np.nan, np.nan, np.nan
         return n_peaks, dt, dmag, prominence_second, width_second
     def _local_curvature_features(self, band, window_days=20):
+        """Median curvature on the rise and decline within ±*window_days* of peak.
+        Returns
+        -------
+        tuple
+            ``(rise_curvature, decline_curvature)``
+        """
         t, m = band["time"], band["mag"]
         mask = np.isfinite(t) & np.isfinite(m)
         t, m = t[mask], m[mask]
         return rise_curv, decline_curv
     def extract_features(self, return_uncertainty=False, n_trials=20):
+        """Generate the full reLAISS feature vector for the supplied LC.
+        Parameters
+        ----------
+        return_uncertainty : bool, default False
+            If True, performs *n_trials* MC perturbations and appends 1-σ errors
+            (columns with ``_err`` suffix).
+        n_trials : int, default 20
+            Number of Monte-Carlo resamples when *return_uncertainty* is True.
+        Returns
+        -------
+        pandas.DataFrame | None
+            Single-row feature table (with optional error columns) or *None* when
+            either band lacks data after pre-processing.
+        """
         if len(self.g["time"]) == 0 or len(self.r["time"]) == 0:
             # print(
             #     f"Warning: No data left in g or r band after filtering for object: {self.ZTFID}. Skipping."

{code → src/relaiss}/relaiss_func.py RENAMED Viewed

@@ -13,7 +13,6 @@ from kneed import KneeLocator
 from pyod.models.iforest import IForest
 from statsmodels import robust
 def re_build_indexed_sample(
     dataset_bank_path,
     lc_features=[],
@@ -26,6 +25,42 @@ def re_build_indexed_sample(
     force_recreation_of_index=False,
     weight_lc_feats_factor=1,
 ):
     df_bank = pd.read_csv(dataset_bank_path)
     # Confirm that the first column is the ZTF ID, and index by ZTF ID
@@ -138,7 +173,38 @@ def re_LAISS_primer(
     host_features=[],
     num_sims=10,
 ):
     feature_names = lc_features + host_features
     if lc_ztf_id is not None and theorized_lightcurve_df is not None:
         print(
@@ -349,6 +415,34 @@ def re_LAISS_nearest_neighbors(
     save_figures=True,
     path_to_figure_directory="../figures",
 ):
     start_time = time.time()
     index_file = annoy_index_file_stem + ".ann"
@@ -676,6 +770,23 @@ def re_train_AD_model(
     max_samples=1024,
     force_retrain=False,
 ):
     feature_names = lc_features + host_features
     df_bank_path = path_to_dataset_bank
     model_dir = path_to_models_directory
@@ -742,6 +853,28 @@ def re_anomaly_detection(
     max_samples=1024,
     force_retrain=False,
 ):
     print("Running Anomaly Detection:\n")
     # Train the model (if necessary)
@@ -842,7 +975,32 @@ def re_LAISS(
     force_AD_retrain=False,  # Retrains and saves AD model even if it already exists
     save_figures=True,  # Saves all figures while running LAISS
 ):
     if run_NN or suggest_neighbor_num:
         # build ANNOY indexed sample from dataset bank
         index_stem_name_with_path = re_build_indexed_sample(
@@ -915,7 +1073,6 @@ def re_LAISS(
     return
-# Note: old corner plots in the figure directory will be overwritten!
 def re_corner_plot(
     neighbors_df,  # from reLAISS nearest neighbors
     primer_dict,  # from reLAISS nearest neighbors
@@ -924,6 +1081,24 @@ def re_corner_plot(
     path_to_figure_directory="../figures",
     save_plots=True,
 ):
     if primer_dict is None:
         raise ValueError(
             "primer_dict is None. Try running NN search with reLAISS again."

 from pyod.models.iforest import IForest
 from statsmodels import robust
 def re_build_indexed_sample(
     dataset_bank_path,
     lc_features=[],
     force_recreation_of_index=False,
     weight_lc_feats_factor=1,
 ):
+    """Create (or load) an ANNOY index over a reference feature bank.
+    Parameters
+    ----------
+    dataset_bank_path : str | Path
+        CSV produced by ``re_build_dataset_bank``; first column must be
+        ``ztf_object_id``.
+    lc_features, host_features : list[str]
+        Feature columns to include in the index.
+        Provide one or both lists.
+    use_pca : bool, default False
+        Apply PCA before indexing.
+    n_components : int | None
+        Dimensionality of PCA space; ignored if *use_pca=False*.
+    num_trees : int, default 1000
+        Number of random projection trees for ANNOY.
+    path_to_index_directory : str | Path, default ""
+        Folder for ``*.ann`` plus ``*.npy`` support files.
+    save : bool, default True
+        Persist index and numpy arrays.
+    force_recreation_of_index : bool, default False
+        Rebuild even when an index file already exists.
+    weight_lc_feats_factor : float, default 1
+        Scalar >1 up-weights LC columns relative to host features
+        (ignored if *use_pca=True*).
+    Returns
+    -------
+    str
+        Stem path (without ``.ann`` extension) of the built/loaded index.
+    Raises
+    ------
+    ValueError
+        When feature inputs are invalid or required columns are missing.
+    """
     df_bank = pd.read_csv(dataset_bank_path)
     # Confirm that the first column is the ZTF ID, and index by ZTF ID
     host_features=[],
     num_sims=10,
 ):
+    """Assemble input feature vectors (and MC replicas) for a query object.
+    Combines LC + host features—optionally swapping in a different host—and
+    returns a dict used later by NN and AD stages.
+    Parameters
+    ----------
+    lc_ztf_id : str | None
+        ZTF ID of the transient to query.  Mutually exclusive with
+        *theorized_lightcurve_df*.
+    theorized_lightcurve_df : pandas.DataFrame | None
+        Pre-computed ANTARES-style LC for a theoretical model.
+    host_ztf_id : str | None
+        If given, replace the query object’s host features with those of this
+        transient.
+    dataset_bank_path, path_to_timeseries_folder, path_to_sfd_data_folder : str | Path
+        Locations for cached data.
+    lc_features, host_features : list[str]
+        Names of columns to extract.
+    num_sims : int, default 10
+        Number of Monte-Carlo perturbations for uncertainty propagation.
+    Returns
+    -------
+    dict
+        Primer dictionary containing feature arrays, metadata, and MC sims.
+    Raises
+    ------
+    ValueError
+        On inconsistent inputs or missing data.
+    """
     feature_names = lc_features + host_features
     if lc_ztf_id is not None and theorized_lightcurve_df is not None:
         print(
     save_figures=True,
     path_to_figure_directory="../figures",
 ):
+    """Query the ANNOY index and plot nearest-neighbor diagnostics.
+    Parameters
+    ----------
+    primer_dict : dict
+        Output from :func:`re_LAISS_primer`.
+    annoy_index_file_stem : str
+        Stem path returned by :func:`re_build_indexed_sample`.
+    use_pca, num_pca_components : see ``use_pca`` / ``n_components`` in :func:`re_build_indexed_sample`
+    n : int, default 8
+        Number of neighbours to return.
+    suggest_neighbor_num : bool, default False
+        If True, plots the distance elbow and exits early.
+    max_neighbor_dist : float | None
+        Optional cut on L1 distance.
+    search_k : int, default 1000
+        ANNOY *search_k* parameter.
+    weight_lc_feats_factor : float, default 1
+        Same interpretation as in ``re_build_indexed_sample``.
+    save_figures : bool, default True
+        Write LC + host plots and distance-elbow PNGs.
+    path_to_figure_directory : str | Path
+    Returns
+    -------
+    pandas.DataFrame | None
+        Table summarising neighbours (or *None* if *suggest_neighbor_num=True*).
+    """
     start_time = time.time()
     index_file = annoy_index_file_stem + ".ann"
     max_samples=1024,
     force_retrain=False,
 ):
+    """Train or load an Isolation-Forest anomaly-detection model.
+    Parameters
+    ----------
+    lc_features, host_features : list[str]
+        Feature columns used by the model.
+    path_to_dataset_bank : str | Path
+    path_to_models_directory : str | Path
+    n_estimators, contamination, max_samples : see :class:`pyod.models.iforest.IForest`
+    force_retrain : bool, default False
+        Ignore cached model and retrain.
+    Returns
+    -------
+    str
+        Filesystem path to the saved ``.pkl`` pipeline.
+    """
     feature_names = lc_features + host_features
     df_bank_path = path_to_dataset_bank
     model_dir = path_to_models_directory
     max_samples=1024,
     force_retrain=False,
 ):
+    """Run anomaly detection for a single transient (with optional host swap).
+    Generates an AD probability plot and calls
+    :func:`re_check_anom_and_plot`.
+    Parameters
+    ----------
+    transient_ztf_id : str
+        Target object ID.
+    host_ztf_id_to_swap_in : str | None
+        Replace host features before scoring.
+    lc_features, host_features : list[str]
+    path_* : folders for intermediates, models, and figures.
+    save_figures : bool, default True
+    n_estimators, contamination, max_samples : Isolation-Forest params.
+    force_retrain : bool, default False
+        Pass-through to :func:`re_train_AD_model`.
+    Returns
+    -------
+    None
+    """
     print("Running Anomaly Detection:\n")
     # Train the model (if necessary)
     force_AD_retrain=False,  # Retrains and saves AD model even if it already exists
     save_figures=True,  # Saves all figures while running LAISS
 ):
+    """High-level convenience wrapper: build index → NN search → AD.
+    Combines the *primer*, *nearest-neighbours*, and *anomaly-detection*
+    pipelines with many toggles for experimentation.
+    Parameters
+    ----------
+    transient_ztf_id : str | None
+    theorized_lightcurve_df : pandas.DataFrame | None
+    host_ztf_id_to_swap_in : str | None
+    lc_feature_names, host_feature_names : list[str]
+    neighbors : int
+        Target neighbour count.
+    suggest_neighbor_num : bool
+        Show elbow plot instead of full NN run.
+    run_NN, run_AD : bool
+        Enable/disable each pipeline stage.
+    *Other params*
+        See lower-level helpers for details.
+    Returns
+    -------
+    (pandas.DataFrame | None, dict | None)
+        Neighbours table and primer dict when NN stage executed; otherwise
+        *None*.
+    """
     if run_NN or suggest_neighbor_num:
         # build ANNOY indexed sample from dataset bank
         index_stem_name_with_path = re_build_indexed_sample(
     return
 def re_corner_plot(
     neighbors_df,  # from reLAISS nearest neighbors
     primer_dict,  # from reLAISS nearest neighbors
     path_to_figure_directory="../figures",
     save_plots=True,
 ):
+    """Corner-plot visualisation of feature distributions vs. neighbours.
+    Parameters
+    ----------
+    neighbors_df : pandas.DataFrame
+        Output from :func:`re_LAISS_nearest_neighbors`.
+    primer_dict : dict
+        Output from :func:`re_LAISS_primer`.
+    path_to_dataset_bank : str | Path
+    remove_outliers_bool : bool, default True
+        Apply robust MAD clipping before plotting.
+    path_to_figure_directory : str | Path, default "../figures"
+    save_plots : bool, default True
+        Write PNGs to ``corner_plots/``.
+    Returns
+    -------
+    None
+    """
     if primer_dict is None:
         raise ValueError(
             "primer_dict is None. Try running NN search with reLAISS again."

static/reLAISS_logo.png ADDED Viewed

Git LFS Details

SHA256: a24af358ae92e2f8950d424fafa441ffbdc337b9c1b0818c6b0f3b3a717b5fd1
Pointer size: 132 Bytes
Size of remote file: 1.47 MB

tests/test_search.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import pytest
+import pandas as pd
+import numpy as np
+import relaiss as rl
+@pytest.fixture(scope="session")
+def relaiss_client():
+    """Provide the object returned by ``rl.load_reference()`` to tests.
+    Session-scoped, so the load runs at most once per test session.  If
+    ``rl.load_reference()`` raises ``FileNotFoundError``, the requesting test
+    is skipped instead of failing.
+    """
+    try:
+        client = rl.load_reference()
+    except FileNotFoundError as err:
+        # pytest.skip raises, so the ``return`` below never runs with
+        # ``client`` unbound.
+        pytest.skip(f"Reference index unavailable – {err}")
+    return client
+def test_load_reference_singleton(relaiss_client):
+    """Check that repeated ``rl.load_reference()`` calls return the same object.
+    ``relaiss_client`` is not used in the body; requesting it means this test
+    is skipped whenever the fixture skips.
+    """
+    c1 = rl.load_reference()
+    c2 = rl.load_reference()
+    assert c1 is c2, "load_reference should cache the client instance"
+def test_find_neighbors_dataframe(relaiss_client):
+    """Check the type, columns, length and ordering of ``rl.find_neighbors``.
+    For ``k=5`` the result must be a ``pandas.DataFrame`` whose columns are
+    exactly ``["ztfid", "distance"]``, with five rows and non-decreasing
+    distances.  ``relaiss_client`` is not used in the body; requesting it
+    means this test is skipped whenever the fixture skips.
+    """
+    df = rl.find_neighbors("ZTF21abbzjeq", k=5)  # arbitrary real ZTF ID
+    assert isinstance(df, pd.DataFrame)
+    assert list(df.columns) == ["ztfid", "distance"]
+    assert len(df) == 5
+    # Distances should be non-decreasing
+    assert np.all(df["distance"].values[:-1] <= df["distance"].values[1:])