openova/core/pool-domain-manager/cmd/pdm/main.go
e3mrah af4ed5ed94
fix(pdm/dynadot): auto-register NS glue records before set_ns (#1496)
Dynadot rejects set_ns when any NS hostname is not yet registered
as a glue record in the customer's account. The 31-line code comment
above SetNameservers documents this requirement but the implementation
never landed at the adapter layer — only the per-request handler-side
glueIP path (BYO Flow B, issue #900) registered glue, leaving the
mothership parent-domain onboard flow exposed.

Live blocker on 2026-05-15: founder attempted zero-touch onboard of
fresh parent domain omani.homes; the flow stalled because
ns3.openova.io had never been registered as a Dynadot glue record on
this account (ns1/ns2 had been registered long ago when openova.io
itself was onboarded). Failure surface:
  "'ns3.openova.io' needs to be registered with an ip address before
   it can be used."
Required out-of-band manual API calls to unblock, defeating the
zero-touch property the architecture is supposed to deliver.

Fix (adapter layer, no per-request flag, always-on when configured):
- Adapter gains NSGlueIP field; SetNameservers iterates every NS
  hostname BEFORE set_ns, skips in-bailiwick children of the domain
  being set, calls RegisterGlueRecord(host, NSGlueIP) for the rest.
- RegisterGlueRecord (already idempotent per issue #900) short-
  circuits via get_ns on identical IP, falls through to set_ns_ip
  on a stale IP, and runs register_ns when the host is missing — so
  a SetNameservers retry costs only get_ns probes, not extra writes.
- A typed registrar error inside the register loop returns
  immediately without calling set_ns (fail-fast contract).
- POOL_DOMAIN_MANAGER_NS_GLUE_IP env var (canonical operator-config
  pattern in this repo) threaded through cmd/pdm/main.go onto the
  Dynadot adapter at PDM startup. Empty value preserves prior
  pass-through behaviour, keeping BYO Flow B handler-level glue
  authoritative for per-request Sovereign add-domain calls.

Tests (httptest server, 7 new cases) cover:
  - AllFresh: 3 NS hostnames, all unregistered → 3× (get_ns+register_ns)
    + set_ns (7 API calls, in order).
  - OneAlreadyRegistered: middle NS short-circuits via get_ns,
    others register, set_ns runs.
  - RegisterFails_SetNsNotCalled: 429 mid-register surfaces
    ErrRateLimited unwrapped; set_ns must NOT execute.
  - SetNsFailsAfterRegister: pre-register completes, set_ns
    returns Dynadot error; ErrDomainNotInAccount surfaces.
  - SkipsInBailiwick: in-bailiwick NS hostname (child of domain
    being set) is skipped entirely (no get_ns, no register_ns).
  - DisabledWhenNSGlueIPEmpty: backward-compat — bare SetNameservers
    issues exactly one set_ns call when env var unset.
  - IsInBailiwickHost: case- and trailing-dot-tolerant table test.

go build ./... and go test ./... both green across the entire
core/pool-domain-manager module.

Co-authored-by: hatiyildiz <hatice.yildiz@openova.io>
2026-05-15 13:32:49 +04:00

290 lines
10 KiB
Go

// Command pdm — pool-domain-manager service entrypoint.
//
// Wires CNPG/Postgres (store), the PowerDNS Authoritative REST client
// (pdns), the registrar adapters (#170), and the chi-based HTTP router.
// At startup it bootstraps every managed pool zone in PowerDNS so /reserve
// can issue NS-delegation records into a parent zone that exists.
//
// All configuration is read from environment variables — per
// docs/INVIOLABLE-PRINCIPLES.md #4 nothing here is hardcoded:
//
// PORT — listen port (default 8080)
// PDM_DATABASE_URL — postgres DSN, REQUIRED
// PDM_PDNS_BASE_URL — PowerDNS REST API base URL, REQUIRED
// (e.g. http://powerdns.openova-system.svc.cluster.local:8081)
// PDM_PDNS_API_KEY — PowerDNS X-API-Key header value, REQUIRED
// PDM_PDNS_SERVER_ID — PowerDNS server identifier, default "localhost"
// PDM_NAMESERVERS — comma-separated FQDNs for child-zone NS RRsets and
// parent NS delegation records, default
// "ns1.openova.io,ns2.openova.io,ns3.openova.io"
// DYNADOT_MANAGED_DOMAINS — comma-separated managed pool list (for /check
// gating + parent-zone bootstrap)
// DYNADOT_DOMAIN — legacy single-domain fallback
// DYNADOT_API_KEY — kept for the registrar adapter (#170 BYO flow)
// DYNADOT_API_SECRET — kept for the registrar adapter (#170 BYO flow)
// POOL_DOMAIN_MANAGER_NS_GLUE_IP — IPv4 of the mothership PowerDNS LB. When
// set, the Dynadot registrar adapter pre-
// registers every out-of-bailiwick NS hostname
// against the customer's Dynadot account before
// set_ns, fixing the parent-domain onboard
// flow that previously failed on Dynadot's
// "'ns3.openova.io' needs to be registered
// with an ip address" rejection (issue #1500).
// PDM_RESERVATION_TTL — go duration string, default "10m"
// PDM_SWEEPER_INTERVAL — go duration string, default "30s"
// PDM_LOG_LEVEL — debug | info | warn | error (default info)
package main
import (
"context"
"errors"
"log/slog"
"net/http"
"os"
"os/signal"
"strings"
"syscall"
"time"
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
"github.com/openova-io/openova/core/pool-domain-manager/internal/allocator"
"github.com/openova-io/openova/core/pool-domain-manager/internal/dynadot"
"github.com/openova-io/openova/core/pool-domain-manager/internal/handler"
"github.com/openova-io/openova/core/pool-domain-manager/internal/pdns"
registrar "github.com/openova-io/openova/core/pool-domain-manager/internal/registrar"
regCloudflare "github.com/openova-io/openova/core/pool-domain-manager/internal/registrar/cloudflare"
regDynadot "github.com/openova-io/openova/core/pool-domain-manager/internal/registrar/dynadot"
regGoDaddy "github.com/openova-io/openova/core/pool-domain-manager/internal/registrar/godaddy"
regNamecheap "github.com/openova-io/openova/core/pool-domain-manager/internal/registrar/namecheap"
regOVH "github.com/openova-io/openova/core/pool-domain-manager/internal/registrar/ovh"
"github.com/openova-io/openova/core/pool-domain-manager/internal/store"
)
// main is the pdm process entrypoint. It loads configuration, connects the
// store, bootstraps the managed parent zones in PowerDNS, starts the
// reservation sweeper, wires the registrar adapters into the HTTP handler and
// serves until SIGINT or SIGTERM, then drains in-flight requests.
//
// Exit codes: 2 when configuration cannot be loaded; 1 for a store
// connection, bootstrap, HTTP server, or graceful-shutdown failure.
func main() {
	log := newLogger(env("PDM_LOG_LEVEL", "info"))
	slog.SetDefault(log)

	cfg, err := loadConfig()
	if err != nil {
		log.Error("config load failed", "err", err)
		os.Exit(2)
	}

	// ctx is cancelled on SIGINT/SIGTERM; the wait near the end of main
	// blocks on it to trigger the graceful drain.
	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer cancel()

	startCtx, startCancel := context.WithTimeout(ctx, 30*time.Second)
	defer startCancel()

	s, err := store.New(startCtx, cfg.DatabaseURL)
	if err != nil {
		log.Error("postgres connect failed", "err", err)
		os.Exit(1)
	}
	defer s.Close()

	pdnsClient := pdns.New(cfg.PDNSBaseURL, cfg.PDNSServerID, cfg.PDNSAPIKey)
	alloc := allocator.New(s, pdnsClient, log, allocator.Config{
		Nameservers:    cfg.Nameservers,
		ReservationTTL: cfg.ReservationTTL,
	})

	// Bootstrap every managed pool zone before HTTP serves traffic. /reserve
	// requires the parent zone to exist so the NS-delegation RRset has
	// somewhere to land. Per docs/PLATFORM-POWERDNS.md the parent zone is
	// authoritative for the OpenOva pool (e.g. `omani.works`) and signs
	// the DS records that anchor each Sovereign's DNSSEC chain.
	bootstrapCtx, bootstrapCancel := context.WithTimeout(ctx, 60*time.Second)
	if err := alloc.BootstrapParentZones(bootstrapCtx, dynadot.ManagedDomains()); err != nil {
		bootstrapCancel()
		log.Error("parent-zone bootstrap failed",
			"managedDomains", dynadot.ManagedDomains(),
			"err", err)
		// os.Exit does not run deferred calls, so release the store
		// explicitly. Neither the sweeper nor the HTTP server has been
		// started yet, so nothing launched by main is still using it.
		s.Close()
		os.Exit(1)
	}
	bootstrapCancel()

	go alloc.RunSweeper(ctx, cfg.SweeperInterval)

	h := handler.New(alloc, s, log)

	// Build the registrar registry: every adapter wires up unconditionally
	// because the customer's API token is supplied per request, not at
	// service-start. Disabling an adapter would only mean omitting it from
	// the map; today we ship all 5.
	//
	// Each adapter is constructed exactly once and registered under its own
	// Name().
	//
	// Dynadot adapter is constructed with NSGlueIP from
	// POOL_DOMAIN_MANAGER_NS_GLUE_IP (when set) so SetNameservers can
	// pre-register every out-of-bailiwick NS hostname against the
	// customer's account before set_ns. This unblocks the mothership
	// parent-domain onboard flow for fresh Dynadot domains that haven't
	// yet had ns1/ns2/ns3.openova.io registered as glue records (issue
	// #1500, 2026-05-15). Empty value → adapter falls back to its
	// pre-fix behaviour and the handler-level glueIP path (BYO Flow B)
	// remains authoritative for per-request glue.
	cloudflareAdapter := regCloudflare.New()
	goDaddyAdapter := regGoDaddy.New()
	namecheapAdapter := regNamecheap.New()
	ovhAdapter := regOVH.New()
	dynadotAdapter := regDynadot.New()
	dynadotAdapter.NSGlueIP = strings.TrimSpace(os.Getenv("POOL_DOMAIN_MANAGER_NS_GLUE_IP"))

	reg := registrar.Registry{
		cloudflareAdapter.Name(): cloudflareAdapter,
		goDaddyAdapter.Name():    goDaddyAdapter,
		namecheapAdapter.Name():  namecheapAdapter,
		ovhAdapter.Name():        ovhAdapter,
		dynadotAdapter.Name():    dynadotAdapter,
	}
	h.SetRegistry(reg)
	log.Info("registrar adapters wired",
		"registrars", reg.Names(),
		"dynadotGlueAutoRegister", dynadotAdapter.NSGlueIP != "",
	)

	root := chi.NewRouter()
	root.Use(middleware.RequestID)
	root.Use(middleware.RealIP)
	root.Use(middleware.Logger)
	root.Use(middleware.Recoverer)
	root.Mount("/", h.Routes())

	srv := &http.Server{
		Addr:              ":" + cfg.Port,
		Handler:           root,
		ReadHeaderTimeout: 10 * time.Second,
		ReadTimeout:       30 * time.Second,
		WriteTimeout:      30 * time.Second,
		IdleTimeout:       2 * time.Minute,
	}

	// Surface the managed-domain list at startup so operators can grep logs
	// for misconfiguration (e.g. typo in the secret's `domains` key).
	log.Info("pool-domain-manager starting",
		"port", cfg.Port,
		"reservationTTL", cfg.ReservationTTL.String(),
		"sweeperInterval", cfg.SweeperInterval.String(),
		"managedDomains", dynadot.ManagedDomains(),
		"nameservers", cfg.Nameservers,
		"pdnsBaseURL", cfg.PDNSBaseURL,
		"pdnsServerID", cfg.PDNSServerID,
	)

	go func() {
		// http.ErrServerClosed is the expected result once Shutdown has been
		// called; any other error terminates the process immediately.
		if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
			log.Error("http server failed", "err", err)
			os.Exit(1)
		}
	}()

	<-ctx.Done()
	log.Info("shutdown signal received, draining")

	// ctx is already cancelled at this point, so the drain deadline is
	// derived from a fresh background context.
	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 20*time.Second)
	defer shutdownCancel()
	if err := srv.Shutdown(shutdownCtx); err != nil {
		log.Error("graceful shutdown failed", "err", err)
		// Deferred cleanup is skipped on this path: the process exits
		// immediately with a non-zero status.
		os.Exit(1)
	}
	log.Info("shutdown complete")
}
// config is the resolved runtime configuration produced by loadConfig.
// Grouping the values in one struct lets loadConfig hand back a single
// value alongside its error.
type config struct {
	// Port is the HTTP listen port (PORT).
	Port string
	// DatabaseURL is the Postgres DSN (PDM_DATABASE_URL).
	DatabaseURL string

	// PDNSBaseURL is the PowerDNS REST API base URL (PDM_PDNS_BASE_URL).
	PDNSBaseURL string
	// PDNSAPIKey is the PowerDNS API key (PDM_PDNS_API_KEY).
	PDNSAPIKey string
	// PDNSServerID is the PowerDNS server identifier (PDM_PDNS_SERVER_ID).
	PDNSServerID string

	// Nameservers holds the parsed PDM_NAMESERVERS hostnames.
	Nameservers []string

	// ReservationTTL is the parsed PDM_RESERVATION_TTL duration.
	ReservationTTL time.Duration
	// SweeperInterval is the parsed PDM_SWEEPER_INTERVAL duration.
	SweeperInterval time.Duration
}
// loadConfig reads the service configuration from environment variables.
//
// PDM_DATABASE_URL, PDM_PDNS_BASE_URL and PDM_PDNS_API_KEY are required; a
// missing or blank value yields an error naming the variable. PORT,
// PDM_PDNS_SERVER_ID, PDM_NAMESERVERS, PDM_RESERVATION_TTL and
// PDM_SWEEPER_INTERVAL fall back to built-in defaults. The two duration
// settings must parse with time.ParseDuration.
//
// On error the returned *config is nil.
func loadConfig() (*config, error) {
	c := &config{
		Port: env("PORT", "8080"),
	}

	c.DatabaseURL = strings.TrimSpace(os.Getenv("PDM_DATABASE_URL"))
	if c.DatabaseURL == "" {
		return nil, errors.New("PDM_DATABASE_URL is required")
	}

	c.PDNSBaseURL = strings.TrimSpace(os.Getenv("PDM_PDNS_BASE_URL"))
	if c.PDNSBaseURL == "" {
		return nil, errors.New("PDM_PDNS_BASE_URL is required")
	}

	c.PDNSAPIKey = strings.TrimSpace(os.Getenv("PDM_PDNS_API_KEY"))
	if c.PDNSAPIKey == "" {
		return nil, errors.New("PDM_PDNS_API_KEY is required")
	}

	// Trim first, then default: a whitespace-only value must fall back to
	// "localhost" rather than produce an empty server identifier.
	c.PDNSServerID = strings.TrimSpace(os.Getenv("PDM_PDNS_SERVER_ID"))
	if c.PDNSServerID == "" {
		c.PDNSServerID = "localhost"
	}

	nsRaw := strings.TrimSpace(os.Getenv("PDM_NAMESERVERS"))
	if nsRaw == "" {
		// Default per docs/PLATFORM-POWERDNS.md — these are the canonical
		// NS endpoints documented for the OpenOva fleet. Configurable via
		// PDM_NAMESERVERS so a Sovereign-overlay can rebadge.
		nsRaw = "ns1.openova.io,ns2.openova.io,ns3.openova.io"
	}
	c.Nameservers = parseNameservers(nsRaw)
	if len(c.Nameservers) == 0 {
		// Reachable when the variable holds only separators (e.g. ",,").
		return nil, errors.New("PDM_NAMESERVERS contained no valid hostnames")
	}

	ttlStr := env("PDM_RESERVATION_TTL", "10m")
	ttl, err := time.ParseDuration(ttlStr)
	if err != nil {
		return nil, errors.New("PDM_RESERVATION_TTL is not a valid duration: " + err.Error())
	}
	c.ReservationTTL = ttl

	swStr := env("PDM_SWEEPER_INTERVAL", "30s")
	sw, err := time.ParseDuration(swStr)
	if err != nil {
		return nil, errors.New("PDM_SWEEPER_INTERVAL is not a valid duration: " + err.Error())
	}
	c.SweeperInterval = sw

	return c, nil
}
// env returns the value of the environment variable key with surrounding
// whitespace removed. It returns fallback when the variable is unset, empty,
// or consists only of whitespace.
func env(key, fallback string) string {
	// Trim before testing for emptiness so that a padded value is usable and
	// a blank one does not shadow the fallback.
	if v := strings.TrimSpace(os.Getenv(key)); v != "" {
		return v
	}
	return fallback
}
// parseNameservers converts a list of hostnames separated by commas and/or
// whitespace into a slice of lower-cased names. Duplicates (compared after
// lower-casing) are dropped, keeping the first occurrence, and the input
// order is otherwise preserved. The returned slice is never nil.
func parseNameservers(raw string) []string {
	// Commas are folded into spaces so a single whitespace split handles
	// both separators.
	fields := strings.Fields(strings.ReplaceAll(raw, ",", " "))

	hosts := make([]string, 0, len(fields))
	known := make(map[string]struct{}, len(fields))
	for i := range fields {
		host := strings.ToLower(strings.TrimSpace(fields[i]))
		if _, dup := known[host]; dup || host == "" {
			continue
		}
		known[host] = struct{}{}
		hosts = append(hosts, host)
	}
	return hosts
}
// newLogger returns a JSON slog.Logger that writes to stdout and emits
// records at or above the named level. The name is matched
// case-insensitively against "debug", "warn" and "error"; any other value
// selects the info level.
func newLogger(level string) *slog.Logger {
	threshold := slog.LevelInfo
	switch strings.ToLower(level) {
	case "debug":
		threshold = slog.LevelDebug
	case "warn":
		threshold = slog.LevelWarn
	case "error":
		threshold = slog.LevelError
	}

	opts := &slog.HandlerOptions{Level: threshold}
	return slog.New(slog.NewJSONHandler(os.Stdout, opts))
}