Today I was trying to install another TCA Manager and I ran into an issue with the following error:
failed to start tca deployment service
After investigating and going through the /var/log/messages logs, I saw a certificate error:
2025-05-26T09:49:32.914577+00:00 photon kubelet[36502]: E0526 09:49:32.914566 36502 bootstrap.go:266] part of the existing bootstrap client certificate in /etc/kubernetes/kubelet.conf is expired: 2025-05-21 20:07:01 +0000 UTC
2025-05-26T09:49:32.914710+00:00 photon kubelet[36502]: E0526 09:49:32.914700 36502 run.go:74] "command failed" err="failed to run Kubelet: unable to load bootstrap kubeconfig: stat /etc/kubernetes/bootstrap-kubelet.conf: no such file or directory"
Then I found this workaround on GitHub to renew the kubelet certificate, since the command kubeadm certs renew all cannot renew it and it has to be renewed manually. The commands are:
cd /etc/kubernetes
kubeadm config print init-defaults > InitConfiguration.yaml
kubeadm kubeconfig user --config InitConfiguration.yaml --client-name system:node-proj-control-plane > kubelet.conf
Afterwards, to re-run the deployment script in TCA, you can run the command below:
/opt/vmware/script/deploy-tca.sh
After executing the commands, I found that some of the pods were not running. I had a call with VMware, and they suggested using the following script to renew all the certificates:
#!/bin/bash
#######################################
# Send an HTTP DELETE to the local certs endpoint to trigger a kubeadm
# certificate rotation and report the outcome.
# Arguments: none
# Outputs:   the HTTP status code, then either the error body or the
#            post-rotation `kubeadm certs check-expiration` output (stdout)
# Returns:   0 when the endpoint answers HTTP 200, 1 otherwise
#######################################
rotate_kubeadm_certs() {
  local url=http://100.102.1.1:8080/api/v1alpha1/certs
  local response http_code content certs_expiry_after

  # -w appends the status code as its own final line, so the body and the
  # code can be separated again below.
  response=$(curl -s -w "\n%{http_code}" -X DELETE "$url")
  http_code=$(tail -n1 <<< "$response")
  echo "cert rotation response code $http_code"

  if [[ "$http_code" != "200" ]]; then
    # Everything except the last line is the response body.
    content=$(sed '$ d' <<< "$response")
    echo "cert rotation failed with error $content"
    # Signal the failure to the caller instead of silently returning 0.
    return 1
  fi

  echo "successfully rotated the certs"
  certs_expiry_after=$(kubeadm certs check-expiration)
  echo "certs expiry after rotation $certs_expiry_after"
}
#######################################
# Report whether `kubeadm certs check-expiration` prints any line that
# contains "invalid" (case-insensitive).
# Arguments: none
# Returns:   0 if at least one such line exists, 1 otherwise
#######################################
check_kubeadm_cert_expiry() {
  # https://github.com/kubernetes/apimachinery/blob/v0.31.4/pkg/util/duration/duration.go#L48
  # NOTE(review): presumably "invalid" is how an already-elapsed residual
  # time is rendered by the helper linked above - confirm.
  local invalid_lines
  invalid_lines=$(kubeadm certs check-expiration | grep -i 'invalid')

  # The status of this test is the function's return value.
  [[ -n "$invalid_lines" ]]
}
#######################################
# Stash the current kubelet kubeconfig and client certificate files in a
# backup directory under /home/admin.
# Arguments: none
# Outputs:   a progress message on stdout
# Returns:   status of the final cp
#######################################
backup_kubelet_certs() {
  local backup_dir=/home/admin/kubelet-certs-backup

  echo "Backing up kubelet certificates..."
  mkdir -p "${backup_dir}"

  # kubelet.conf is moved (not copied), so it no longer exists in
  # /etc/kubernetes afterwards; the client cert files are only copied.
  mv /etc/kubernetes/kubelet.conf "${backup_dir}/"
  cp /var/lib/kubelet/pki/kubelet-client* "${backup_dir}/"
}
#######################################
# Write the kubeadm ClusterConfiguration used to regenerate kubelet.conf.
# The YAML nesting is significant: extraArgs, imageRepository, imageTag and
# profiling appear under several different parents, so a flattened document
# would contain duplicate top-level keys.
# Arguments: $1 - output path (optional; default /home/admin/kubeadm.config)
# Outputs:   a progress message on stdout; the config file on disk
# Returns:   status of the file write
#######################################
create_kubeadm_config() {
  local config_path=${1:-/home/admin/kubeadm.config}

  echo "Creating kubeadm config..."
  # Quoted delimiter: the document is written literally, with no shell
  # expansion of its contents.
  cat << 'EOF' > "$config_path"
apiServer:
  extraArgs:
    authorization-mode: Node,RBAC
    profiling: "false"
  timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: photon
controlPlaneEndpoint: 100.102.1.1:6443
controllerManager:
  extraArgs:
    leader-elect: "false"
    node-cidr-mask-size-ipv6: "112"
    profiling: "false"
dns:
  imageRepository: projects.registry.vmware.com/tkg
  imageTag: v1.10.1_vmware.2
etcd:
  local:
    dataDir: /common/vmware/snc/etcd
    extraArgs:
      cipher-suites: TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384
    imageRepository: projects.registry.vmware.com/tkg
    imageTag: v3.5.7_vmware.2
imageRepository: projects.registry.vmware.com/tkg
kind: ClusterConfiguration
kubernetesVersion: v1.26.5+vmware.2
networking:
  dnsDomain: cluster.local
  podSubnet: 100.100.0.0/16,2001:db8:1::/112
  serviceSubnet: 100.101.0.0/16,2001:db8:2::/112
scheduler:
  extraArgs:
    leader-elect: "false"
    profiling: "false"
EOF
}
#######################################
# Regenerate /etc/kubernetes/kubelet.conf from the output of
# `kubeadm kubeconfig user`, using the config in /home/admin/kubeadm.config.
# Arguments: none
# Outputs:   a progress message on stdout; kubelet.conf is overwritten
# Returns:   status of the kubeadm invocation
#######################################
generate_kubelet_config() {
  local -a user_args=(
    --org system:nodes
    --client-name system:node:photon
    --config /home/admin/kubeadm.config
  )

  echo "Generating kubelet config..."
  # The redirect truncates kubelet.conf before kubeadm runs, so a failed
  # run leaves an empty file behind.
  kubeadm kubeconfig user "${user_args[@]}" > /etc/kubernetes/kubelet.conf
}
#######################################
# Restart the kubelet systemd unit.
# Arguments: none
# Outputs:   a progress message on stdout
# Returns:   status of `systemctl restart kubelet`
#######################################
restart_kubelet() {
  printf '%s\n' "Restarting kubelet..."
  systemctl restart kubelet
}
#######################################
# Rewrite the client-certificate and client-key entries of
# /etc/kubernetes/kubelet.conf so both reference
# /var/lib/kubelet/pki/kubelet-client-current.pem.
# Arguments: none
# Outputs:   a progress message on stdout; kubelet.conf is edited in place
# Returns:   status of sed
#######################################
update_kubelet_config() {
  echo "Updating kubelet config with new certificate paths..."
  # NOTE(review): the patterns only match the `client-certificate:` and
  # `client-key:` keys; `client-certificate-data:` / `client-key-data:`
  # entries are left untouched - confirm that is intended.
  sed -i \
    -e 's|client-certificate: .*$|client-certificate: /var/lib/kubelet/pki/kubelet-client-current.pem|' \
    -e 's|client-key: .*$|client-key: /var/lib/kubelet/pki/kubelet-client-current.pem|' \
    /etc/kubernetes/kubelet.conf
}
#######################################
# Poll the "photon" node every 5 seconds until its Ready condition reports
# "True". There is no timeout: the loop runs until the node is Ready.
# Arguments: none
# Outputs:   one status line per poll on stdout
# Returns:   0 once the node is Ready
#######################################
check_node_readiness() {
  local ready_status

  echo "Checking node readiness..."
  until
    ready_status=$(kubectl get nodes photon --kubeconfig=/home/admin/.kube/config -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')
    [ "$ready_status" == "True" ]
  do
    echo "Node is not ready or down."
    sleep 5
  done
  echo "Node is up (Ready)."
}
#######################################
# Wait for the tcxproduct resources to become Ready.
# Sleeps 120 seconds first, then polls every 5 seconds for up to 600
# seconds. Success means exactly 2 tcxproduct resources exist and every
# one of them has a Ready condition with status "True".
# Arguments: none
# Outputs:   progress messages and the names of not-ready resources (stdout)
# Returns:   0 when all resources are ready, 1 when the timeout is reached
#######################################
check_tcxproduct_readiness() {
  local -r kubeconfig=/home/admin/.kube/config
  local -r expected_count=2 timeout_secs=600
  local start_time products not_ready resource_count

  # Give 2mins for tcx installer to create tcxproduct installer certs
  echo "Waiting until tcx installer to create tcxproduct"
  sleep 120

  echo "Checking tcxproduct resource readiness..."
  start_time=$(date +%s)
  while true; do
    if (( $(date +%s) - start_time >= timeout_secs )); then
      echo "Timeout reached. Not all tcxproduct resources are ready."
      return 1
    fi

    # Query the API once per iteration so both jq filters see the same
    # snapshot of the resources.
    products=$(kubectl get tcxproduct --all-namespaces --kubeconfig="$kubeconfig" -o json)
    # A resource is not ready unless it carries a Ready condition whose
    # status is "True"; resources with no conditions yet count as not ready.
    not_ready=$(jq '.items[] | select(any(.status.conditions[]?; .type == "Ready" and .status == "True") | not) | .metadata.name' <<< "$products")
    resource_count=$(jq '.items | length' <<< "$products")

    # [[ ]] is required here: && is not valid inside a single [ ] test.
    if [[ -z "$not_ready" && "$resource_count" -eq "$expected_count" ]]; then
      echo "All tcxproduct resources are ready!"
      return 0
    fi

    echo "Some tcxproduct resources are not ready:"
    echo "$not_ready"
    echo "Waiting for 5 seconds before checking again..."
    sleep 5
  done
}
#######################################
# Run the kubelet certificate recovery sequence end to end.
# Steps run unconditionally in order; the status of individual steps is
# not checked, so a failing step does not stop the sequence.
# Arguments: none (any arguments are ignored)
# Outputs:   progress messages from each step on stdout
# Returns:   status of the final readiness check
#######################################
main() {
  # Step 1: Backup existing kubelet certificates
  backup_kubelet_certs
  # Step 2: Create kubeadm config file
  create_kubeadm_config
  # Step 3: Generate kubelet config using kubeadm
  generate_kubelet_config
  # Step 4: Restart the kubelet
  restart_kubelet
  # Step 5: Update kubelet.conf with new cert paths
  update_kubelet_config
  # Step 6: Restart the kubelet again after updating the config
  restart_kubelet
  # Step 7: Update Kubeadm certs if expired
  # check_kubeadm_cert_expiry returns 0 when it finds expired certs, so its
  # exit status is tested directly rather than captured and compared.
  if check_kubeadm_cert_expiry; then
    echo "Kubeadm certs are expired. Renewing them"
    rotate_kubeadm_certs
  else
    echo "Kubeadm certs are active"
  fi
  # Step 8: Ensure the node becomes Ready
  check_node_readiness
  # Step 9: Ensure all tcxproduct resources are reconciled and ready
  check_tcxproduct_readiness
}
# Script entry point: run the full recovery sequence.
main "$@"