Press Space or click the green arrow icons to navigate the slides ->
Data Science / Machine Learning batch jobs:
Data Science / Machine Learning batch jobs:
Scaling (DS) infrastructure.
AWS ECS
AWS Batch
Kubernetes
Source: xkcd
Other use-cases:
Source: sparecores.com
>>> from rich import print as pp
>>> from sc_crawler.tables import Server
>>> from sqlmodel import create_engine, Session, select
>>> engine = create_engine("sqlite:///sc-data-all.db")
>>> session = Session(engine)
>>> server = session.exec(select(Server).where(Server.server_id == 'g4dn.xlarge')).one()
>>> pp(server)
Server(
server_id='g4dn.xlarge',
vendor_id='aws',
display_name='g4dn.xlarge',
api_reference='g4dn.xlarge',
name='g4dn.xlarge',
family='g4dn',
description='Graphics intensive [Instance store volumes] [Network and EBS optimized] Gen4 xlarge',
status=<Status.ACTIVE: 'active'>,
observed_at=datetime.datetime(2024, 6, 6, 10, 18, 4, 127254),
hypervisor='nitro',
vcpus=4,
cpu_cores=2,
cpu_allocation=<CpuAllocation.DEDICATED: 'Dedicated'>,
cpu_manufacturer='Intel',
cpu_family='Xeon',
cpu_model='8259CL',
cpu_architecture=<CpuArchitecture.X86_64: 'x86_64'>,
cpu_speed=3.5,
cpu_l1_cache=None,
cpu_l2_cache=None,
cpu_l3_cache=None,
cpu_flags=[],
memory_amount=16384,
memory_generation=<DdrGeneration.DDR4: 'DDR4'>,
memory_speed=3200,
memory_ecc=None,
gpu_count=1,
gpu_memory_min=16384,
gpu_memory_total=16384,
gpu_manufacturer='Nvidia',
gpu_family='Turing',
gpu_model='Tesla T4',
gpus=[
{
'manufacturer': 'Nvidia',
'family': 'Turing',
'model': 'Tesla T4',
'memory': 15360,
'firmware_version': '535.171.04',
'bios_version': '90.04.96.00.A0',
'graphics_clock': 1590,
'sm_clock': 1590,
'mem_clock': 5001,
'video_clock': 1470
}
],
storage_size=125,
storage_type=<StorageType.NVME_SSD: 'nvme ssd'>,
storages=[{'size': 125, 'storage_type': 'nvme ssd'}],
network_speed=5.0,
inbound_traffic=0.0,
outbound_traffic=0.0,
ipv4=0,
)
Source: dbdocs.io/spare-cores
Need to optionally track price etc. changes.
Let’s update the cpu_cores
column to be optional, as
some vendors as shy sharing that over their APIs. We will backfill with
the Spare Cores Inspector!
"""v0.1.1 cores optional
Revision ID: 4691089690c2
Revises: 98894dffd37c
Create Date: 2024-04-10 00:59:03.509522
"""
from typing import Sequence, Union
import sqlalchemy as sa
import sqlmodel
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "4691089690c2"
down_revision: Union[str, None] = "98894dffd37c"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
## need to provide the table schema for offline mode support
meta = sa.MetaData()
server_table = sa.Table(
"server_scd" if op.get_context().config.attributes.get("scd") else "server",
meta,
sa.Column(
"vendor_id",
sqlmodel.sql.sqltypes.AutoString(),
nullable=False,
),
sa.Column(
"server_id",
sqlmodel.sql.sqltypes.AutoString(),
nullable=False,
),
sa.Column(
"name",
sqlmodel.sql.sqltypes.AutoString(),
nullable=False,
),
sa.Column(
"vcpus",
sa.Integer(),
nullable=False,
),
sa.Column(
"hypervisor",
sqlmodel.sql.sqltypes.AutoString(),
nullable=True,
),
sa.Column(
"cpu_allocation",
sa.Enum("SHARED", "BURSTABLE", "DEDICATED", name="cpuallocation"),
nullable=False,
),
sa.Column(
"cpu_cores",
sa.Integer(),
nullable=False,
),
sa.Column(
"cpu_speed",
sa.Float(),
nullable=True,
),
sa.Column(
"cpu_architecture",
sa.Enum(
"ARM64",
"ARM64_MAC",
"I386",
"X86_64",
"X86_64_MAC",
name="cpuarchitecture",
),
nullable=False,
),
sa.Column(
"cpu_manufacturer",
sqlmodel.sql.sqltypes.AutoString(),
nullable=True,
),
sa.Column(
"cpu_family",
sqlmodel.sql.sqltypes.AutoString(),
nullable=True,
),
sa.Column(
"cpu_model",
sqlmodel.sql.sqltypes.AutoString(),
nullable=True,
),
sa.Column(
"cpus",
sa.JSON(),
nullable=False,
),
sa.Column("memory", sa.Integer(), nullable=False),
sa.Column(
"gpu_count",
sa.Integer(),
nullable=False,
),
sa.Column(
"gpu_memory_min",
sa.Integer(),
nullable=True,
),
sa.Column(
"gpu_memory_total",
sa.Integer(),
nullable=True,
),
sa.Column(
"gpu_manufacturer",
sqlmodel.sql.sqltypes.AutoString(),
nullable=True,
),
sa.Column(
"gpu_model",
sqlmodel.sql.sqltypes.AutoString(),
nullable=True,
),
sa.Column(
"gpus",
sa.JSON(),
nullable=False,
),
sa.Column(
"storage_size",
sa.Integer(),
nullable=False,
),
sa.Column(
"storage_type",
sa.Enum("HDD", "SSD", "NVME_SSD", "NETWORK", name="storagetype"),
nullable=True,
),
sa.Column(
"storages",
sa.JSON(),
nullable=False,
),
sa.Column(
"network_speed",
sa.Float(),
nullable=True,
),
sa.Column(
"inbound_traffic",
sa.Float(),
nullable=False,
),
sa.Column(
"outbound_traffic",
sa.Float(),
nullable=False,
),
sa.Column(
"ipv4",
sa.Integer(),
nullable=False,
),
sa.Column(
"status",
sa.Enum("ACTIVE", "INACTIVE", name="status"),
nullable=False,
),
sa.Column(
"observed_at",
sa.DateTime(),
nullable=False,
),
sa.ForeignKeyConstraint(
["vendor_id"],
["vendor.vendor_id"],
),
sa.PrimaryKeyConstraint("vendor_id", "server_id", "observed_at")
if op.get_context().config.attributes.get("scd")
else sa.PrimaryKeyConstraint("vendor_id", "server_id"),
)
def upgrade() -> None:
if op.get_context().config.attributes.get("scd"):
with op.batch_alter_table(
"server_scd", schema=None, copy_from=server_table
) as batch_op:
batch_op.alter_column(
"cpu_cores", existing_type=sa.INTEGER(), nullable=True
)
else:
with op.batch_alter_table(
"server", schema=None, copy_from=server_table
) as batch_op:
batch_op.alter_column(
"cpu_cores", existing_type=sa.INTEGER(), nullable=True
)
def downgrade() -> None:
if op.get_context().config.attributes.get("scd"):
with op.batch_alter_table(
"server_scd", schema=None, copy_from=server_table
) as batch_op:
batch_op.alter_column(
"cpu_cores", existing_type=sa.INTEGER(), nullable=False
)
else:
with op.batch_alter_table(
"server", schema=None, copy_from=server_table
) as batch_op:
batch_op.alter_column(
"cpu_cores", existing_type=sa.INTEGER(), nullable=False
)
CREATE TABLE _alembic_tmp_server (
vendor_id VARCHAR NOT NULL,
server_id VARCHAR NOT NULL,
name VARCHAR NOT NULL,
vcpus INTEGER NOT NULL,
hypervisor VARCHAR,
cpu_allocation VARCHAR(9) NOT NULL,
cpu_cores INTEGER,
cpu_speed FLOAT,
cpu_architecture VARCHAR(10) NOT NULL,
cpu_manufacturer VARCHAR,
cpu_family VARCHAR,
cpu_model VARCHAR,
cpus JSON NOT NULL,
memory INTEGER NOT NULL,
gpu_count INTEGER NOT NULL,
gpu_memory_min INTEGER,
gpu_memory_total INTEGER,
gpu_manufacturer VARCHAR,
gpu_model VARCHAR,
gpus JSON NOT NULL,
storage_size INTEGER NOT NULL,
storage_type VARCHAR(8),
storages JSON NOT NULL,
network_speed FLOAT,
inbound_traffic FLOAT NOT NULL,
outbound_traffic FLOAT NOT NULL,
ipv4 INTEGER NOT NULL,
status VARCHAR(8) NOT NULL,
observed_at DATETIME NOT NULL,
description VARCHAR,
PRIMARY KEY (vendor_id, server_id),
FOREIGN KEY(vendor_id) REFERENCES vendor (vendor_id)
);
INSERT INTO _alembic_tmp_server (vendor_id, server_id, name, vcpus, hypervisor, cpu_allocation, cpu_cores, cpu_speed, cpu_architecture, cpu_manufacturer, cpu_family, cpu_model, cpus, memory, gpu_count, gpu_memory_min, gpu_memory_total, gpu
_manufacturer, gpu_model, gpus, storage_size, storage_type, storages, network_speed, inbound_traffic, outbound_traffic, ipv4, status, observed_at) SELECT server.vendor_id, server.server_id, server.name, server.vcpus, server.hypervisor, ser
ver.cpu_allocation, server.cpu_cores, server.cpu_speed, server.cpu_architecture, server.cpu_manufacturer, server.cpu_family, server.cpu_model, server.cpus, server.memory, server.gpu_count, server.gpu_memory_min, server.gpu_memory_total, se
rver.gpu_manufacturer, server.gpu_model, server.gpus, server.storage_size, server.storage_type, server.storages, server.network_speed, server.inbound_traffic, server.outbound_traffic, server.ipv4, server.status, server.observed_at
FROM server;
DROP TABLE server;
ALTER TABLE _alembic_tmp_server RENAME TO server;
UPDATE alembic_version SET version_num='4691089690c2' WHERE alembic_version.version_num = '98894dffd37c';
eu-west-1
Europe (Ireland)
EU (Ireland)
eu-west-1
Dublin (IE)
f1-micro
is one out of 2 instances with simple
pricing.
C2D
.c2
, which is called “Compute
optimized”.m2
is actually priced at a premium on the top of
m1
.n1
resource group is not CPU/RAM, but
N1Standard
, extract if it’s CPU or RAM price from
description.Source: dbhub.io/sparecores
Information collected from vendor APIs is very limited, so we run:
dmidecode
lscpu
lshw
bw_mem
stress-ng
Data is collected in public: sc-inspector-data
repo on
GitHub.
docker run --rm -ti -v /var/run/docker.sock:/var/run/docker.sock \
-e GITHUB_TOKEN=${GITHUB_TOKEN} \
-e BENCHMARK_SECRETS_PASSPHRASE=${BENCHMARK_SECRETS_PASSPHRASE} \
ghcr.io/sparecores/sc-inspector:main \
inspect --vendor ${VENDOR} --instance ${INSTANCE} --gpu-count ${GPU_COUNT}
$ docker run --rm -ti \
ghcr.io/sparecores/sc-runner:main \
create aws --instance t4g.nano
Updating (aws.us-west-2.None.t4g.nano):
pulumi:pulumi:Stack runner-aws.us-west-2.None.t4g.nano running
+ pulumi:providers:aws us-west-2 creating (0s)
@ updating....
+ pulumi:providers:aws us-west-2 created (0.29s)
+ aws:ec2:SecurityGroup t4g.nano creating (0s)
@ updating.....
+ aws:ec2:SecurityGroup t4g.nano created (2s)
@ updating....
+ aws:vpc:SecurityGroupIngressRule t4g.nano-0 creating (0s)
+ aws:vpc:SecurityGroupIngressRule t4g.nano-1 creating (0s)
+ aws:ec2:Instance t4g.nano creating (0s)
+ aws:vpc:SecurityGroupEgressRule t4g.nano-1 creating (0s)
+ aws:vpc:SecurityGroupEgressRule t4g.nano-0 creating (0s)
@ updating....
+ aws:vpc:SecurityGroupIngressRule t4g.nano-0 created (1s)
+ aws:vpc:SecurityGroupIngressRule t4g.nano-1 created (1s)
+ aws:vpc:SecurityGroupEgressRule t4g.nano-1 created (1s)
@ updating....
+ aws:vpc:SecurityGroupEgressRule t4g.nano-0 created (1s)
@ updating..............
+ aws:ec2:Instance t4g.nano created (13s)
@ updating....
pulumi:pulumi:Stack runner-aws.us-west-2.None.t4g.nano
Resources:
+ 7 created
1 unchanged
$ curl https://keeper.sparecores.net/server/aws/g4dn.xlarge | jq
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 111k 100 111k 0 0 79795 0 0:00:01 0:00:01 --:--:-- 79799
{
"vendor_id": "aws",
"server_id": "g4dn.xlarge",
"name": "g4dn.xlarge",
"api_reference": "g4dn.xlarge",
"display_name": "g4dn.xlarge",
"description": "Graphics intensive [Instance store volumes] [Network and EBS optimized] Gen4 xlarge",
"family": "g4dn",
"vcpus": 4,
"hypervisor": "nitro",
"cpu_allocation": "Dedicated",
"cpu_cores": 2,
"cpu_speed": 3.5,
"cpu_architecture": "x86_64",
"cpu_manufacturer": "Intel",
"cpu_family": "Xeon",
"cpu_model": "8259CL",
"cpu_l1_cache": 131072,
"cpu_l2_cache": 2097152,
"cpu_l3_cache": 37486592,
"cpu_flags": [
"fpu",
"vme",
"de",
"pse",
"tsc",
"msr",
"pae",
...
],
"cpus": [],
"memory_amount": 16384,
"memory_generation": "DDR4",
"memory_speed": 2933,
"memory_ecc": null,
"gpu_count": 1,
"gpu_memory_min": 16384,
"gpu_memory_total": 16384,
"gpu_manufacturer": "Nvidia",
"gpu_family": "Turing",
"gpu_model": "Tesla T4",
"gpus": [
{
"manufacturer": "Nvidia",
"family": "Turing",
"model": "Tesla T4",
"memory": 15360,
"firmware_version": "535.171.04",
"bios_version": "90.04.96.00.A0",
"graphics_clock": 1590,
"sm_clock": 1590,
"mem_clock": 5001,
"video_clock": 1470
}
],
"storage_size": 125,
"storage_type": "nvme ssd",
"storages": [
{
"size": 125,
"storage_type": "nvme ssd"
}
],
"network_speed": 5.0,
"inbound_traffic": 0.0,
"outbound_traffic": 0.0,
"ipv4": 0,
"status": "active",
"observed_at": "2024-06-09T21:20:22.005194",
"vendor": {
"logo": "https://sc-data-public-40e9d310.s3.amazonaws.com/cdn/logos/aws.svg",
"address_line": "410 Terry Ave N",
"name": "Amazon Web Services",
"zip_code": "98109",
"founding_year": 2002,
"state": "Washington",
"status_page": "https://health.aws.amazon.com/health/status",
"vendor_id": "aws",
"homepage": "https://aws.amazon.com",
"country_id": "US",
"status": "active",
"observed_at": "2024-06-09T21:50:32.658281",
"city": "Seattle"
},
"prices": [
{
"vendor_id": "aws",
"region_id": "af-south-1",
"zone_id": "afs1-az1",
"server_id": "g4dn.xlarge",
"operating_system": "Linux",
"allocation": "ondemand",
"unit": "hour",
"price": 0.698,
"price_upfront": 0.0,
"price_tiered": [],
"currency": "USD",
"status": "active",
"observed_at": "2024-06-09T21:21:10.015921",
"region": {
"country_id": "ZA",
"state": null,
"founding_year": 2020,
"green_energy": false,
"name": "Africa (Cape Town)",
"city": "Cape Town",
"status": "active",
"address_line": null,
"observed_at": "2024-06-09T21:19:37.529944",
"zip_code": null,
"lon": 18.3758801,
"region_id": "af-south-1",
"display_name": "Cape Town (ZA)",
"lat": -33.914651,
"vendor_id": "aws",
"api_reference": "af-south-1",
"aliases": []
},
"zone": {
"region_id": "af-south-1",
"zone_id": "afs1-az1",
"api_reference": "af-south-1a",
"status": "active",
"vendor_id": "aws",
"name": "af-south-1a",
"display_name": "af-south-1a",
"observed_at": "2024-06-09T21:19:40.425499"
}
},
{
"vendor_id": "aws",
"region_id": "af-south-1",
"zone_id": "afs1-az2",
"server_id": "g4dn.xlarge",
"operating_system": "Linux",
"allocation": "spot",
"unit": "hour",
"price": 0.2251,
"price_upfront": 0.0,
"price_tiered": [],
"currency": "USD",
"status": "active",
"observed_at": "2024-06-09T18:16:26",
"region": {
"country_id": "ZA",
"state": null,
"founding_year": 2020,
"green_energy": false,
"name": "Africa (Cape Town)",
"city": "Cape Town",
"status": "active",
"address_line": null,
"observed_at": "2024-06-09T21:19:37.529944",
"zip_code": null,
"lon": 18.3758801,
"region_id": "af-south-1",
"display_name": "Cape Town (ZA)",
"lat": -33.914651,
"vendor_id": "aws",
"api_reference": "af-south-1",
"aliases": []
},
"zone": {
"region_id": "af-south-1",
"zone_id": "afs1-az2",
"api_reference": "af-south-1b",
"status": "active",
"vendor_id": "aws",
"name": "af-south-1b",
"display_name": "af-south-1b",
"observed_at": "2024-06-09T21:19:40.425554"
}
},
...
],
"benchmark_scores": [
{
"server_id": "g4dn.xlarge",
"config": {},
"status": "active",
"vendor_id": "aws",
"benchmark_id": "bogomips",
"score": 5000.0,
"note": null,
"observed_at": "2024-06-07T04:26:48.643640"
},
{
"server_id": "g4dn.xlarge",
"config": {
"cores": 1,
"framework_version": "0.17.08"
},
"status": "active",
"vendor_id": "aws",
"benchmark_id": "stress_ng:cpu_all",
"score": 1385.583093,
"note": null,
"observed_at": "2024-06-07T04:27:14.552982"
},
{
"server_id": "g4dn.xlarge",
"config": {
"cores": 4,
"framework_version": "0.17.08"
},
"status": "active",
"vendor_id": "aws",
"benchmark_id": "stress_ng:cpu_all",
"score": 4013.022928,
"note": null,
"observed_at": "2024-06-07T04:27:02.508145"
}
]
}
Source: sparecores.com
@bra-fsn
@palabola
@daroczig
@bra-fsn
Infrastructure and Python veteran.
@palabola
Guardian of the front-end and Node.js tools.
@daroczig
Hack of all trades, master of NaN
.
Slides: sparecores.com/talks