Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
87 commits
Select commit Hold shift + click to select a range
0678f96
Accept the terms of service for conda repos
bpkroth Jul 17, 2025
a3a82f7
Make conda output easier for debugging
bpkroth Jul 17, 2025
fcbfc49
more output adjustments
bpkroth Jul 17, 2025
c666915
Revert "more output adjustments"
bpkroth Jul 17, 2025
2dcfa42
Revert "Make conda output easier for debugging"
bpkroth Jul 17, 2025
9965602
try a suggestion
bpkroth Jul 17, 2025
57b673a
fixups
bpkroth Jul 17, 2025
2084a08
Merge branch 'main' into ci-fixups
bpkroth Jul 17, 2025
ecebe33
type fixups
bpkroth Sep 18, 2025
e7f524d
type checking fixups
bpkroth Sep 18, 2025
1b490ae
use conda by default
bpkroth Sep 18, 2025
9ad7794
small pyright fixups
bpkroth Sep 22, 2025
3c0aa23
Install pre-built pyrfr from conda to workaround build error.
bpkroth Sep 22, 2025
a8d9ccd
ignore a deprecation warning in matplotlib
bpkroth Sep 22, 2025
1862037
fixups
bpkroth Sep 22, 2025
b2530a3
ignore more warnings
bpkroth Sep 22, 2025
d914bcc
Avoid Debian trixie for now since there's no azure-cli package yet. …
bpkroth Sep 22, 2025
b6b5561
Moby has also been removed from Debian Trixie
bpkroth Sep 22, 2025
b059ac3
add more debug logging for missing docker support
bpkroth Sep 23, 2025
021f451
more debugging
bpkroth Sep 23, 2025
ac57951
comments
bpkroth Sep 23, 2025
4462cdf
fixup for local testing
bpkroth Sep 23, 2025
74d94cf
log docker missing warnings
bpkroth Sep 23, 2025
06a9826
Split docker test fixtures out so they can come up and down in parallel.
bpkroth Sep 23, 2025
cba92cd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2025
797ac40
comments and revert to moby
bpkroth Sep 23, 2025
07924d6
comments and revert to moby
bpkroth Sep 23, 2025
9870d35
make some more fixtures available
bpkroth Sep 23, 2025
c775d89
quotes fixup
bpkroth Sep 23, 2025
72aec7b
upload the coverage.xml file regardless
bpkroth Sep 23, 2025
9fd6843
comments and port forwarding for doc viewing
bpkroth Sep 23, 2025
95a4ae0
Merge branch 'ci-fixups' into split-docker-tests
bpkroth Sep 23, 2025
6fbd25b
revert
bpkroth Sep 23, 2025
382ab78
revertme: temporarily make docker required to see what the issue is i…
bpkroth Sep 23, 2025
496fba8
more debugging
bpkroth Sep 24, 2025
6298fa4
more debug info
bpkroth Oct 21, 2025
2f7c010
trying to print some extra info about the docker env while in the git…
bpkroth Oct 21, 2025
06d4d8d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 21, 2025
3621506
more debugging
bpkroth Oct 21, 2025
751028e
more debugging
bpkroth Oct 21, 2025
d4fc3ba
mypy
bpkroth Oct 21, 2025
5ef8132
fix debugging output
bpkroth Oct 21, 2025
74c30f8
add some more debug info
bpkroth Oct 21, 2025
4b090c6
improved debug checks
bpkroth Oct 21, 2025
fd61c79
cleanup
bpkroth Oct 21, 2025
5b28210
cleanup
bpkroth Oct 21, 2025
2c8668f
cleanup
bpkroth Oct 21, 2025
5088cae
comments and sync
bpkroth Oct 21, 2025
f1ca2d0
Merge branch 'ci-fixups' into split-docker-tests
bpkroth Oct 21, 2025
bee6f99
lint fixups
bpkroth Oct 21, 2025
d6107ad
Merge branch 'main' into split-docker-tests
bpkroth Oct 22, 2025
ba46ff6
Apply suggestion from @Copilot
bpkroth Oct 22, 2025
30f87f1
Apply suggestion from @Copilot
bpkroth Oct 22, 2025
30b7d6a
Apply suggestion from @Copilot
bpkroth Oct 22, 2025
a4be9fb
remove unused imports
bpkroth Oct 22, 2025
13efd80
Merge branch 'main' into split-docker-tests
bpkroth Oct 23, 2025
1ec818c
Merge branch 'main' into split-docker-tests
bpkroth Oct 23, 2025
ecb7c96
Merge branch 'main' into split-docker-tests
bpkroth Oct 23, 2025
dc9d51a
Merge branch 'main' into split-docker-tests
motus Oct 23, 2025
633bf41
Merge branch 'main' into split-docker-tests
motus Oct 23, 2025
8718d65
add retry logic for connecting
bpkroth Oct 23, 2025
f44a4ee
reduce errors handled for now
bpkroth Oct 23, 2025
d405bfb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 23, 2025
6d7d3f7
Merge branch 'main' into ssh-test-fixups
motus Oct 24, 2025
e6af465
Merge branch 'main' into ssh-test-fixups
motus Oct 25, 2025
8334ede
avoid mypy error
bpkroth Oct 27, 2025
4458b57
copilot debugging
bpkroth Oct 28, 2025
5a79292
capture all output
bpkroth Oct 28, 2025
59893d7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2025
991d504
more debugging hacks
bpkroth Oct 28, 2025
ca59432
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2025
5904817
more logs
bpkroth Oct 28, 2025
74fa084
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2025
421e2c5
test logging
bpkroth Oct 28, 2025
0f52e5f
partial revert to use logger instead of warn()
bpkroth Oct 28, 2025
5ddff64
comment
bpkroth Oct 28, 2025
a59c2a8
logging tweaks
bpkroth Oct 29, 2025
804bb91
adding pytest log capture and upload - even on failure
bpkroth Oct 29, 2025
ff31b4e
adding a basic healthcheck to look for readiness of the container
bpkroth Oct 29, 2025
dae49cd
pylint
bpkroth Oct 29, 2025
4b981f8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 29, 2025
fcd2266
remove
bpkroth Oct 29, 2025
3d5da84
remove special testing checks
bpkroth Oct 29, 2025
95f7bc5
cleanup and consolidation
bpkroth Oct 29, 2025
586472f
add unique artifact names
bpkroth Oct 29, 2025
8e2af72
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 29, 2025
938d53f
partial revert of launcher.py log format changes
bpkroth Oct 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions .github/workflows/devcontainer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -208,13 +208,6 @@ jobs:
# Now actually run the tests.
docker exec --user vscode --env USER=vscode mlos-devcontainer make CONDA_INFO_LEVEL=-v test

- name: Upload coverage report as build artifact
if: always() # ensures it runs even if tests fail
uses: actions/upload-artifact@v4
with:
name: coverage-report
path: coverage.xml

- name: Generate and test binary distribution files
timeout-minutes: 10
run: |
Expand Down Expand Up @@ -298,6 +291,21 @@ jobs:
docker tag mlos-devcontainer:latest ${{ secrets.ACR_LOGINURL }}/mlos-devcontainer:$image_tag
docker push ${{ secrets.ACR_LOGINURL }}/mlos-devcontainer:$image_tag

# ensure these run even if tests fail, but only after everything else is done or skipped
- name: Upload coverage report as build artifact
if: always()
uses: actions/upload-artifact@v5
with:
name: coverage-report
path: coverage.xml

- name: Upload pytest logs
if: always()
uses: actions/upload-artifact@v5
with:
name: devcontainer-pytest-logs
path: logs/pytest*.log
retention-days: 7

PublishDocs:
name: Publish Documentation
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,11 @@ jobs:

- name: Generate and test binary distribution files
run: make CONDA_ENV_NAME=$CONDA_ENV_NAME CONDA_INFO_LEVEL=-v dist dist-test

- name: Upload pytest logs
if: always()
uses: actions/upload-artifact@v5
with:
name: linux-${{ matrix.python_version }}-pytest-logs
path: logs/pytest*.log
retention-days: 7
7 changes: 7 additions & 0 deletions .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,13 @@ jobs:
- name: Generate and test binary distribution files
run: make CONDA_ENV_NAME=$CONDA_ENV_NAME CONDA_INFO_LEVEL=-v dist dist-test

- name: Upload pytest logs
if: always()
uses: actions/upload-artifact@v5
with:
name: macos-pytest-logs
path: logs/pytest*.log
retention-days: 7

MacOSDevContainerBuildTest:
name: MacOS DevContainer Build/Test
Expand Down
7 changes: 7 additions & 0 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,13 @@ jobs:
run: |
.github/workflows/build-dist-test.ps1

- name: Upload pytest logs
if: always()
uses: actions/upload-artifact@v5
with:
name: windows-pytest-logs
path: logs/pytest*.log
retention-days: 7

WindowsDevContainerBuildTest:
name: Windows DevContainer Build/Test
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,6 @@ build/*.build-stamp
*.duckdb
*.db.wal
*.duckdb.wal

# pytest logs
logs/pytest*.log
23 changes: 23 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Note: This file is named conftest.py so that pytest picks it up automatically
# without the need to adjust PYTHONPATH or sys.path as much.

import logging
import os
import shutil
from tempfile import mkdtemp
Expand All @@ -15,6 +16,14 @@
import pytest
from xdist.workermanage import WorkerController

# See Also: setup.cfg and launcher.py
LOG_FMT = (
"%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] "
"[%(filename)s:%(lineno)d %(funcName)s] "
"%(levelname)s: %(message)s"
)
DATE_FMT = "%Y-%m-%d %H:%M:%S"


def is_master(config: pytest.Config) -> bool:
"""True if the code running the given pytest.config object is running in a xdist
Expand Down Expand Up @@ -56,6 +65,20 @@ def pytest_configure(config: pytest.Config) -> None:
# Add it to the config so that it can passed to the worker nodes.
setattr(config, "shared_temp_dir", mkdtemp())

# Configure per-worker log file.
worker_id = getattr(config, "workerinput", {}).get("workerid", "master")
os.makedirs("logs", exist_ok=True)
log_file_path = os.path.join("logs", f"pytest-{worker_id}.log")
file_handler = logging.FileHandler(log_file_path, mode="w")
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
file_handler.setLevel(logging.DEBUG)
else:
file_handler.setLevel(logging.INFO)
# logging.basicConfig(level=file_handler.level, format=LOG_FMT, datefmt=DATE_FMT)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FIXME: only apply this to the console logger for pytest.

log_formatter = logging.Formatter(fmt=LOG_FMT, datefmt=DATE_FMT)
file_handler.setFormatter(log_formatter)
logging.getLogger().addHandler(file_handler)


def pytest_configure_node(node: WorkerController) -> None:
"""Xdist hook used to inform workers of the location of the shared temp dir."""
Expand Down
8 changes: 8 additions & 0 deletions mlos_bench/mlos_bench/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@
_LOG_FORMAT = "%(asctime)s %(filename)s:%(lineno)d %(funcName)s %(levelname)s %(message)s"
logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT)

# TODO (future PR): switch to the log format below. See also: /conftest.py, setup.cfg
# _LOG_FORMAT = (
# "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] "
# "[%(filename)s:%(lineno)d %(funcName)s] "
# "%(levelname)s %(message)s"
# )
# logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S")

_LOG = logging.getLogger(__name__)


Expand Down
77 changes: 52 additions & 25 deletions mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from abc import ABCMeta
from asyncio import Event as CoroEvent
from asyncio import Lock as CoroLock
from asyncio import sleep as async_sleep
from collections.abc import Callable, Coroutine
from threading import current_thread
from types import TracebackType
Expand Down Expand Up @@ -172,34 +173,60 @@ async def get_client_connection(
A tuple of (SSHClientConnection, SshClient).
"""
_LOG.debug("%s: get_client_connection: %s", current_thread().name, connect_params)
async with self._cache_lock:
connection_id = SshClient.id_from_params(connect_params)
client: None | SshClient | asyncssh.SSHClient
_, client = self._cache.get(connection_id, (None, None))
if client:
_LOG.debug("%s: Checking cached client %s", current_thread().name, connection_id)
connection = await client.connection()
if not connection:
_LOG.debug(
"%s: Removing stale client connection %s from cache.",
current_thread().name,
connection_id,
)
self._cache.pop(connection_id)
# Try to reconnect next.
else:
_LOG.debug("%s: Using cached client %s", current_thread().name, connection_id)
if connection_id not in self._cache:
_LOG.debug(
"%s: Establishing client connection to %s",
connection_id = SshClient.id_from_params(connect_params)
for i in range(3): # TODO: make the retry count configurable
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: polish this retry loop.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also add tests for the connection retry logic.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also add config support for the retry settings (e.g. retry count and delay).

try:
async with self._cache_lock:
client: None | SshClient | asyncssh.SSHClient
_, client = self._cache.get(connection_id, (None, None))
if client:
_LOG.debug(
"%s: Checking cached client %s", current_thread().name, connection_id
)
connection = await client.connection()
if not connection:
_LOG.debug(
"%s: Removing stale client connection %s from cache.",
current_thread().name,
connection_id,
)
self._cache.pop(connection_id)
# Try to reconnect next.
else:
_LOG.debug(
"%s: Using cached client %s", current_thread().name, connection_id
)
if connection_id not in self._cache:
_LOG.debug(
"%s: Establishing client connection to %s",
current_thread().name,
connection_id,
)
connection, client = await asyncssh.create_connection(
SshClient, **connect_params
)
assert isinstance(client, SshClient)
self._cache[connection_id] = (connection, client)
_LOG.debug(
"%s: Created connection to %s.", current_thread().name, connection_id
)
return self._cache[connection_id]
except ConnectionRefusedError as ex: # TODO: Add other error handling here too
_LOG.warning(
"%s: Attempt %d: Failed to connect to %s: %s",
current_thread().name,
i + 1,
connection_id,
ex,
)
connection, client = await asyncssh.create_connection(SshClient, **connect_params)
assert isinstance(client, SshClient)
self._cache[connection_id] = (connection, client)
_LOG.debug("%s: Created connection to %s.", current_thread().name, connection_id)
return self._cache[connection_id]
if i < 2: # TODO: adjust to match max range
await async_sleep(1.0) # TODO: Make this configurable
if i == 2: # TODO: adjust to match max range
_LOG.error(
"%s: Giving up connecting to %s", current_thread().name, connection_id
)
raise
raise RuntimeError("Unreachable code in get_client_connection")

def cleanup(self) -> None:
"""Closes all cached connections."""
Expand Down
51 changes: 42 additions & 9 deletions mlos_bench/mlos_bench/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""
import filecmp
import json
import logging
import os
import shutil
import socket
Expand All @@ -24,6 +25,8 @@

from mlos_bench.util import get_class_from_name, nullable

_LOG = logging.getLogger(__name__)

ZONE_NAMES = [
# Explicit time zones.
"UTC",
Expand Down Expand Up @@ -171,11 +174,24 @@ def wait_docker_service_healthy(

def wait_docker_service_socket(docker_services: DockerServices, hostname: str, port: int) -> None:
"""Wait until a docker service is ready."""
docker_services.wait_until_responsive(
check=lambda: check_socket(hostname, port),
timeout=60.0,
pause=0.5,
)
_LOG.info("Waiting for %s:%d to become responsive", hostname, port)

def check_with_logging() -> bool:
result = check_socket(hostname, port)
if not result:
_LOG.debug("Socket check failed for %s:%d", hostname, port)
return result

try:
docker_services.wait_until_responsive(
check=check_with_logging,
timeout=60.0,
pause=0.5,
)
_LOG.info("Socket %s:%d is now responsive", hostname, port)
except Exception as e:
_LOG.error("Failed waiting for %s:%d: %s", hostname, port, e)
raise


def check_socket(host: str, port: int, timeout: float = 1.0) -> bool:
Expand All @@ -192,10 +208,27 @@ def check_socket(host: str, port: int, timeout: float = 1.0) -> bool:
-------
bool
"""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.settimeout(timeout) # seconds
result = sock.connect_ex((host, port))
return result == 0
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.settimeout(timeout) # seconds
result = sock.connect_ex((host, port))
success = result == 0
if not success:
_LOG.debug(
"Socket connection to %s:%d failed with code %d",
host,
port,
result,
)
return success
except (OSError, TimeoutError) as e:
_LOG.debug(
"Socket check exception for %s:%d: %s",
host,
port,
e,
)
return False


def resolve_host_name(host: str) -> str | None:
Expand Down
Loading
Loading