From 3ce72e805fa22e8093ff55c33b657b8ce3b8b1a9 Mon Sep 17 00:00:00 2001 From: Nico Schottelius Date: Sun, 4 Sep 2022 00:55:02 +0200 Subject: [PATCH] ++rook notes --- .../contents.lr | 515 +++++++++++++++++- 1 file changed, 514 insertions(+), 1 deletion(-) diff --git a/content/u/blog/2022-08-27-migrating-ceph-nautilus-into-kubernetes-with-rook/contents.lr b/content/u/blog/2022-08-27-migrating-ceph-nautilus-into-kubernetes-with-rook/contents.lr index fd39cfb..896c395 100644 --- a/content/u/blog/2022-08-27-migrating-ceph-nautilus-into-kubernetes-with-rook/contents.lr +++ b/content/u/blog/2022-08-27-migrating-ceph-nautilus-into-kubernetes-with-rook/contents.lr @@ -460,6 +460,7 @@ In this particular cluster we have 2 pools: The device class "hdd-big" is specific to this cluster as it used to contain 2.5" and 3.5" HDDs in different pools. + ### [old] Analysing the ceph cluster configuration Taking the view from the old cluster, the following items are @@ -481,6 +482,77 @@ allows adding and removing resources. ### Analysing the rook configurations Taking the opposite view, we can also checkout a running rook cluster +and the rook disaster recovery documentation to identify what to +modify. 
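Before diving into individual objects, it helps to inventory what rook manages. A quick way to pull up the relevant objects (assuming the default `rook-ceph` namespace used throughout this post; these commands need a working kubectl context):

```shell
# List the rook-managed state we may need to modify for the migration
kubectl -n rook-ceph get secrets
kubectl -n rook-ceph get configmaps
kubectl -n rook-ceph get cephclusters,cephblockpools
```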
+
+Let's have a look at the secrets first:
+
+```
+cluster-peer-token-rook-ceph                 kubernetes.io/rook                    2      320d
+default-token-xm9xs                          kubernetes.io/service-account-token   3      320d
+rook-ceph-admin-keyring                      kubernetes.io/rook                    1      320d
+rook-ceph-admission-controller               kubernetes.io/tls                     3      29d
+rook-ceph-cmd-reporter-token-5mh88           kubernetes.io/service-account-token   3      320d
+rook-ceph-config                             kubernetes.io/rook                    2      320d
+rook-ceph-crash-collector-keyring            kubernetes.io/rook                    1      320d
+rook-ceph-mgr-a-keyring                      kubernetes.io/rook                    1      320d
+rook-ceph-mgr-b-keyring                      kubernetes.io/rook                    1      320d
+rook-ceph-mgr-token-ktt2m                    kubernetes.io/service-account-token   3      320d
+rook-ceph-mon                                kubernetes.io/rook                    4      320d
+rook-ceph-mons-keyring                       kubernetes.io/rook                    1      320d
+rook-ceph-osd-token-8m6lb                    kubernetes.io/service-account-token   3      320d
+rook-ceph-purge-osd-token-hznnk              kubernetes.io/service-account-token   3      320d
+rook-ceph-rgw-token-wlzbc                    kubernetes.io/service-account-token   3      134d
+rook-ceph-system-token-lxclf                 kubernetes.io/service-account-token   3      320d
+rook-csi-cephfs-node                         kubernetes.io/rook                    2      320d
+rook-csi-cephfs-plugin-sa-token-hkq2g        kubernetes.io/service-account-token   3      320d
+rook-csi-cephfs-provisioner                  kubernetes.io/rook                    2      320d
+rook-csi-cephfs-provisioner-sa-token-tb78d   kubernetes.io/service-account-token   3      320d
+rook-csi-rbd-node                            kubernetes.io/rook                    2      320d
+rook-csi-rbd-plugin-sa-token-dhhq6           kubernetes.io/service-account-token   3      320d
+rook-csi-rbd-provisioner                     kubernetes.io/rook                    2      320d
+rook-csi-rbd-provisioner-sa-token-lhr4l      kubernetes.io/service-account-token   3      320d
+```
+
+TBC
+
+### Creating additional resources after the cluster is bootstrapped
+
+To let rook know what should be there, we already create the two
+`CephBlockPool` instances that match the existing pools:
+
+```
+apiVersion: ceph.rook.io/v1
+kind: CephBlockPool
+metadata:
+  name: one
+  namespace: rook-ceph
+spec:
+  failureDomain: host
+  replicated:
+    size: 3
+  deviceClass: ssd
+```
+
+And for the hdd based pool:
+
+```
+apiVersion: ceph.rook.io/v1
+kind: CephBlockPool
+metadata:
+  name: hdd
+  namespace: rook-ceph
+spec:
+  failureDomain: host
+  replicated:
+    size: 3
+  deviceClass: hdd-big
+```
+
+Saving both of these in `ceph-blockpools.yaml` and applying it:
+
+```
+kubectl -n rook-ceph apply -f ceph-blockpools.yaml
+```
 
 ### Configuring ceph after the operator deployment
 
@@ -526,6 +598,29 @@ changes.
 
 Important to note is that we use the ceph image version v14.2.21,
 which is the same version as the native cluster.
 
+
+### rook v1.8 is incompatible with ceph nautilus
+
+After deploying the rook operator, the following error message is
+printed in its logs:
+
+```
+2022-09-03 15:14:03.543925 E | ceph-cluster-controller: failed to reconcile CephCluster "rook-ceph/rook-ceph". failed to reconcile cluster "rook-ceph": failed to configure local ceph cluster: failed the ceph version check: the version does not meet the minimum version "15.2.0-0 octopus"
+```
+
+So we need to downgrade to rook v1.7. Using `helm search repo
+rook/rook-ceph --versions` we identify `v1.7.11` as the latest
+usable version.
+
+We start the downgrade using:
+
+```
+helm upgrade --install --namespace rook-ceph --create-namespace --version v1.7.11 rook-ceph rook/rook-ceph
+```
+
+After downgrading, the operator starts the canary monitors and
+continues to bootstrap the cluster.
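The failing check in the operator is essentially a minimum-version comparison. A rough shell approximation of what happens (`version_ge` is our own illustrative helper, not a rook function):

```shell
# Rough equivalent of rook's minimum-version check.
# version_ge A B: succeeds if version A >= version B (GNU sort -V).
version_ge() {
    [ "$(printf '%s\n%s\n' "$2" "$1" | sort -V | head -n1)" = "$2" ]
}

# Nautilus 14.2.21 against rook v1.8's minimum of 15.2.0 (octopus)
if version_ge 14.2.21 15.2.0; then
    echo "ceph version accepted"
else
    echo "ceph version below minimum - downgrade rook"
fi
# prints: ceph version below minimum - downgrade rook
```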
+
 
 ### The ceph-toolbox
 
 To be able to view the current cluster status, we also deploy the
@@ -552,7 +647,7 @@ spec:
       dnsPolicy: ClusterFirstWithHostNet
       containers:
         - name: rook-ceph-tools
-          image: rook/ceph:v1.8.10
+          image: rook/ceph:v1.7.11
           command: ["/bin/bash"]
           args: ["-m", "-c", "/usr/local/bin/toolbox.sh"]
           imagePullPolicy: IfNotPresent
@@ -593,6 +688,424 @@ spec:
         tolerationSeconds: 5
 ```
 
+### Checking the deployments
+
+After the rook-operator finished deploying, the following deployments
+are visible in kubernetes:
+
+```
+[17:25] blind:~% kubectl -n rook-ceph get deployment
+NAME                                READY   UP-TO-DATE   AVAILABLE   AGE
+csi-cephfsplugin-provisioner        2/2     2            2           21m
+csi-rbdplugin-provisioner           2/2     2            2           21m
+rook-ceph-crashcollector-server48   1/1     1            1           2m3s
+rook-ceph-crashcollector-server52   1/1     1            1           2m24s
+rook-ceph-crashcollector-server53   1/1     1            1           2m2s
+rook-ceph-crashcollector-server56   1/1     1            1           2m17s
+rook-ceph-crashcollector-server57   1/1     1            1           2m1s
+rook-ceph-mgr-a                     1/1     1            1           2m3s
+rook-ceph-mon-a                     1/1     1            1           10m
+rook-ceph-mon-b                     1/1     1            1           8m3s
+rook-ceph-mon-c                     1/1     1            1           5m55s
+rook-ceph-mon-d                     1/1     1            1           5m33s
+rook-ceph-mon-e                     1/1     1            1           4m32s
+rook-ceph-operator                  1/1     1            1           102m
+rook-ceph-tools                     1/1     1            1           17m
+```
+
+Relevant for us are the mgr, mon and operator deployments. To stop
+the cluster, we will shut down the deployments in the following
+order:
+
+* rook-ceph-operator first, to prevent it from recreating the other
+  deployments
+
+### Data / configuration comparison
+
+Logging into a host that is running mon-a, we find the following data
+in it:
+
+```
+[17:36] server56.place5:/var/lib/rook# find
+.
+./mon-a
+./mon-a/data
+./mon-a/data/keyring
+./mon-a/data/min_mon_release
+./mon-a/data/store.db
+./mon-a/data/store.db/LOCK
+./mon-a/data/store.db/000006.log
+./mon-a/data/store.db/000004.sst
+./mon-a/data/store.db/CURRENT
+./mon-a/data/store.db/MANIFEST-000005
+./mon-a/data/store.db/OPTIONS-000008
+./mon-a/data/store.db/OPTIONS-000005
+./mon-a/data/store.db/IDENTITY
+./mon-a/data/kv_backend
+./rook-ceph
+./rook-ceph/crash
+./rook-ceph/crash/posted
+./rook-ceph/log
+```
+
+Which is pretty similar to what we find on the native nodes:
+
+```
+[17:37:50] red3.place5:/var/lib/ceph/mon/ceph-red3# find
+.
+./sysvinit
+./keyring
+./min_mon_release
+./kv_backend
+./store.db
+./store.db/1959645.sst
+./store.db/1959800.sst
+./store.db/OPTIONS-3617174
+./store.db/2056973.sst
+./store.db/3617348.sst
+./store.db/OPTIONS-3599785
+./store.db/MANIFEST-3617171
+./store.db/1959695.sst
+./store.db/CURRENT
+./store.db/LOCK
+./store.db/2524598.sst
+./store.db/IDENTITY
+./store.db/1959580.sst
+./store.db/2514570.sst
+./store.db/1959831.sst
+./store.db/3617346.log
+./store.db/2511347.sst
+```
+
+### Checking how monitors are created on native ceph
+
+To prepare for the migration we take a step back and verify how
+monitors are created in the native cluster. The script used for
+monitor creation can be found on
+[code.ungleich.ch](https://code.ungleich.ch/ungleich-public/ungleich-tools/src/branch/master/ceph/ceph-mon-create-start)
+and contains the following logic:
+
+* get the "mon." key
+* get the monmap
+* run ceph-mon --mkfs using the monmap and keyring
+* start the monitor
+
+In theory we could re-use these steps on a rook-deployed monitor to
+join our existing cluster.
+
+### Checking the toolbox and monitor pods for migration
+
+When the ceph-toolbox is deployed, we get a ceph.conf and a keyring in
+/etc/ceph. The keyring is actually the admin keyring and allows us to
+make modifications to the ceph cluster. The ceph.conf points to the
+monitors and does not contain an fsid.
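This is easy to verify directly from the running pods; the toolbox deployment name `rook-ceph-tools` matches the deployment listing above (these commands assume a live cluster):

```shell
# Compare what the toolbox and a mon pod carry in /etc/ceph
kubectl -n rook-ceph exec deploy/rook-ceph-tools -- cat /etc/ceph/ceph.conf
kubectl -n rook-ceph exec deploy/rook-ceph-mon-a -- cat /etc/ceph/ceph.conf
```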
+
+The ceph-toolbox gets this information via a configmap
+("rook-ceph-mon-endpoints") and a secret ("rook-ceph-mon").
+
+The monitor pods on the other hand have an empty ceph.conf and no
+admin keyring deployed.
+
+### Try 1: recreating a monitor inside the existing cluster
+
+Let's try to reuse an existing monitor and join it into the existing
+cluster. For this we will first shut down the rook-operator, to
+prevent it from interfering with our migration. Then we
+modify the relevant configmaps and secrets and import the settings
+from the native cluster.
+
+Lastly we will patch one of the monitor pods, inject the monmap from
+the native cluster and then restart it.
+
+Let's give it a try. First we shut down the rook-ceph-operator:
+
+```
+% kubectl -n rook-ceph scale --replicas=0 deploy/rook-ceph-operator
+deployment.apps/rook-ceph-operator scaled
+```
+
+Then we patch the mon deployments to not run a monitor, but only
+sleep:
+
+```
+for mon in a b c d e; do
+kubectl -n rook-ceph patch deployment rook-ceph-mon-${mon} -p \
+'{"spec": {"template": {"spec": {"containers": [{"name": "mon", "command": ["sleep", "infinity"], "args": []}]}}}}';
+
+kubectl -n rook-ceph patch deployment rook-ceph-mon-$mon --type='json' -p '[{"op":"remove", "path":"/spec/template/spec/containers/0/livenessProbe"}]'
+done
+```
+
+Now the pods are restarted and when we exec into one of them, we see
+that no monitor is running in it:
+
+```
+% kubectl -n rook-ceph exec -ti rook-ceph-mon-a-c9f8f554b-2fkhm -- sh
+Defaulted container "mon" out of: mon, chown-container-data-dir (init), init-mon-fs (init)
+sh-4.2# ps aux
+USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
+root         1  0.0  0.0   4384   664 ?        Ss   19:44   0:00 sleep infinity
+root         7  0.0  0.0  11844  2844 pts/0    Ss   19:44   0:00 sh
+root        13  0.0  0.0  51752  3384 pts/0    R+   19:44   0:00 ps aux
+sh-4.2#
+```
+
+Now for this pod to work with our existing cluster, we want to import
+the monmap and join the monitor to the native cluster.
As with any
+mon, the data is stored below `/var/lib/ceph/mon/ceph-a/`.
+
+Before importing the monmap, let's have a look at the different rook
+configurations that influence the ceph components.
+
+### Looking at the ConfigMap in detail: rook-ceph-mon-endpoints
+
+As the name says, it contains the list of monitor endpoints:
+
+```
+kubectl -n rook-ceph edit configmap rook-ceph-mon-endpoints
+...
+
+  csi-cluster-config-json: '[{"clusterID":"rook-ceph","monitors":["[2a0a:e5c0:0:15::fc2]:6789"...
+  data: b=[2a0a:e5c0:0:15::9cd9]:6789,....
+  mapping: '{"node":{"a":{"Name":"server56","Hostname":"server56","Address":"2a0a:e5c0::...
+```
+
+As we eventually want the cluster and csi to use the in-cluster
+monitors, we don't need to modify it right away.
+
+### Looking at Secrets in detail: rook-ceph-admin-keyring
+
+The first interesting secret is **rook-ceph-admin-keyring**, which
+contains the admin keyring. The old one of course, so we can edit this
+secret and replace it with the client.admin secret from our native
+cluster.
+
+We encode the original admin keyring using:
+
+```
+cat ceph.client.admin.keyring | base64 -w 0; echo ""
+```
+
+And then we update the secret:
+
+```
+kubectl -n rook-ceph edit secret rook-ceph-admin-keyring
+```
+
+[done]
+
+### Looking at Secrets in detail: rook-ceph-config
+
+This secret contains two keys, **mon_host** and
+**mon_initial_members**. The **mon_host** key is a list of monitor
+addresses. The **mon_initial_members** key only contains the monitor
+names, a, b, c, d and e.
+
+The environment variable **ROOK_CEPH_MON_HOST** in the monitor
+deployment is set to the **mon_host** key of that secret, so monitors
+will read from it.
+
+### Looking at Secrets in detail: rook-ceph-mon
+
+This secret contains the following interesting keys:
+
+* ceph-secret: the admin key (just the base64 key, no section around
+  it) [done]
+* ceph-username: "client.admin"
+* fsid: the ceph cluster fsid
+* mon-secret: The key of the [mon.]
section
+
+It's important to use `echo -n` when inserting the keys or the fsid,
+otherwise a trailing newline ends up in the secret.
+
+[done]
+
+### Looking at Secrets in detail: rook-ceph-mons-keyring
+
+Contains the key "keyring", which holds the [mon.] and [client.admin]
+sections:
+
+```
+[mon.]
+  key = ...
+
+[client.admin]
+  key = ...
+  caps mds = "allow"
+  caps mgr = "allow *"
+  caps mon = "allow *"
+  caps osd = "allow *"
+```
+
+Again encoded using `base64 -w0 < ~/mon-and-client`.
+
+[done]
+
+### Importing the monmap
+
+Getting the current monmap from the native cluster:
+
+```
+ceph mon getmap -o monmap-20220903
+
+scp root@old-monitor:monmap-20220903 .
+```
+
+Adding it into the mon pod:
+
+```
+kubectl cp monmap-20220903 rook-ceph/rook-ceph-mon-a-6c46d4694-kxm5h:/tmp
+```
+
+Moving the old mon db away:
+
+```
+cd /var/lib/ceph/mon/ceph-a
+mkdir _old
+mv [a-z]* _old/
+```
+
+Recreating the mon fails, as the volume is mounted directly onto it:
+
+```
+% ceph-mon -i a --mkfs --monmap /tmp/monmap-20220903 --keyring /tmp/mon-key
+2022-09-03 21:44:48.268 7f1a738f51c0 -1 '/var/lib/ceph/mon/ceph-a' already exists and is not empty: monitor may already exist
+
+% mount | grep ceph-a
+/dev/sda1 on /var/lib/ceph/mon/ceph-a type ext4 (rw,relatime)
+
+```
+
+We can work around this by creating all monitors on pods with other
+names: we create mon b to e on the mon-a pod and mon a on any
+other pod.
+
+On rook-ceph-mon-a:
+
+```
+for mon in b c d e;
+do ceph-mon -i $mon --mkfs --monmap /tmp/monmap-20220903 --keyring /tmp/mon-key;
+done
+```
+
+On rook-ceph-mon-b:
+
+```
+mon=a
+ceph-mon -i $mon --mkfs --monmap /tmp/monmap-20220903 --keyring /tmp/mon-key
+```
+
+Then we export the newly created mon dbs:
+
+```
+for mon in b c d e;
+do kubectl cp rook-ceph/rook-ceph-mon-a-6c46d4694-kxm5h:/var/lib/ceph/mon/ceph-$mon ceph-$mon;
+done
+```
+
+```
+for mon in a;
+do kubectl cp rook-ceph/rook-ceph-mon-b-57d888dd9f-w8jkh:/var/lib/ceph/mon/ceph-$mon ceph-$mon;
+done
+```
+
+And finally we test it by importing the mondb to mon-a:
+
+```
+kubectl cp ceph-a \
+  rook-ceph/rook-ceph-mon-a-6c46d4694-kxm5h:/var/lib/ceph/mon/
+```
+
+And the other mons:
+
+```
+kubectl cp ceph-b rook-ceph/rook-ceph-mon-b-57d888dd9f-w8jkh:/var/lib/ceph/mon/
+```
+
+### Re-enabling the rook-operator
+
+We scale the operator deployment back up:
+
+```
+kubectl -n rook-ceph scale --replicas=1 deploy/rook-ceph-operator
+```
+
+The operator sees the mons running (even though they only run a
+sleeping shell):
+
+```
+2022-09-03 22:29:26.725915 I | op-mon: mons running: [d e a b c]
+```
+
+Triggering recreation of a mon deployment:
+
+```
+% kubectl -n rook-ceph delete deployment rook-ceph-mon-a
+deployment.apps "rook-ceph-mon-a" deleted
+```
+
+The recreated mon connected successfully to the cluster:
+
+```
+  services:
+    mon: 6 daemons, quorum red1,red2,red3,server4,server3,a (age 8s)
+    mgr: red3(active, since 8h), standbys: red2, red1, server4
+    osd: 46 osds: 46 up, 46 in
+```
+
+A bit later:
+
+```
+    mon: 8 daemons, quorum (age 2w), out of quorum: red1, red2, red3, server4, server3, a, c, d
+    mgr: red3(active, since 8h), standbys: red2, red1, server4
+    osd: 46 osds: 46 up, 46 in
+```
+
+And a little bit later also the mgr joined the cluster:
+
+```
+  services:
+    mon: 8 daemons, quorum red2,red3,server4,server3,a,c,d,e (age 46s)
+    mgr: red3(active, since 9h), standbys: red1, server4, a, red2
+    osd: 46 osds: 46 up, 46 in
+```
+
+And a few minutes later all mons joined successfully:
+
+```
+ 
mon: 8 daemons, quorum red3,server4,server3,a,c,d,e,b (age 31s)
+    mgr: red3(active, since 105s), standbys: red1, server4, a, red2
+    osd: 46 osds: 46 up, 46 in
+```
+
+We also need to ensure the toolbox pod is recreated, so that it
+picks up the updated configuration:
+
+```
+kubectl -n rook-ceph delete pods rook-ceph-tools-5cf88dd58f-fwwlc
+```
+
+### Retiring the old monitors
+
+
+### The actual migration
+
+At this point we have 2 ceph clusters:
+
+* A new one in rook
+* The old/native one
+
+The next steps are:
+
+* Replace the fsid in secrets/rook-ceph-mon with that of the old one.
+
 ## Changelog