ceph01 192.168.11.21
ceph02 192.168.11.22
ceph03 192.168.11.23
cephosd01 192.168.11.31
cephosd02 192.168.11.32
cephosd03 192.168.11.33
cephosd04 192.168.11.34
Disable SELINUX and FIREWALLD
setenforce 0
grubby --update-kernel ALL --args selinux=0
systemctl stop firewalld.service
systemctl disable firewalld.service
systemctl mask firewalld.service
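Note that setenforce 0 only switches SELinux to permissive for the running session; in addition to the grubby kernel argument above, you can also make it persistent in /etc/selinux/config (a minimal sketch, assuming the default SELINUX=enforcing line is present):
sed -i 's/^SELINUX=enforcing/SELINUX=disabled/' /etc/selinux/config
grep ^SELINUX= /etc/selinux/config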
Set your NIC interfaces to MTU 9000 in the NetworkManager system-connections files or in Netplan. For example,
edit /etc/NetworkManager/system-connections/ens192.nmconnection and add:
[ethernet]
mtu=9000
Or set it with nmcli:
~]# nmcli connection show "ens192" | grep 802-3-ethernet.mtu
802-3-ethernet.mtu: auto
====================================================================
sudo nmcli connection modify ens192 802-3-ethernet.mtu 9000
sudo nmcli connection down ens192 && sudo nmcli connection up ens192
====================================================================
~]# nmcli connection show "ens192" | grep 802-3-ethernet.mtu
802-3-ethernet.mtu: 9000
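To verify that jumbo frames actually work end to end (the switch ports must allow MTU 9000 as well), you can send a non-fragmentable 9000-byte packet between two nodes, for example from ceph01 to ceph02:
ping -c 3 -M do -s 8972 192.168.11.22
8972 bytes of ICMP payload plus 28 bytes of IP/ICMP headers equals 9000; if the ping fails with "message too long", some hop on the path is not passing jumbo frames.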
VERY IMPORTANT
Install chrony on all nodes !
dnf install chrony -y
If you need to, add your own NTP server to the chrony config:
/etc/chrony.conf
server ntp.example.com iburst prefer
Afterwards, make sure timedatectl reports "System clock synchronized: yes" on every node!
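A minimal way to enable and check time sync on each node (assuming the default chronyd service name on EL9):
systemctl enable --now chronyd
chronyc sources -v
chronyc tracking
timedatectl | grep -i synchronized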
INSTALLING CEPH
dnf install epel-release -y
dnf install nano wget bash-completion bind-utils podman -y
CEPH_RELEASE=19.2.2
curl -o /usr/local/bin/cephadm --silent --remote-name --location https://download.ceph.com/rpm-${CEPH_RELEASE}/el9/noarch/cephadm
chmod 755 /usr/local/bin/cephadm
~]# cephadm version
cephadm version 19.2.2 (0eceb0defba60152a8182f7bd87d164b639885b8) squid (stable)
cephadm add-repo --release 19.2.2
cephadm install ceph-common
~]# ceph -v
ceph version 19.2.2 (0eceb0defba60152a8182f7bd87d164b639885b8) squid (stable)
Install the Ceph CRUSH tool (crushtool):
~]# dnf install ceph-base -y
Run the commands below on the first node, or pick one node for the initial bootstrap. The general form is:
cephadm bootstrap --mon-ip <MON_IP> --cluster-network <CIDR> --skip-monitoring-stack --with-centralized-logging
cephadm bootstrap --mon-ip 192.168.11.21 --skip-monitoring-stack
Ceph Dashboard is now available at:
URL: https://ceph01:8443/
User: admin
Password: 0npoabe73p
Go to the dashboard and change the initial password.
The Ceph mgr dashboard automatically redirects you to the active mgr's web dashboard.
~]# cat /etc/ceph/ceph.pub
Add this key to /root/.ssh/authorized_keys on all other hosts.
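For example, you can push the cluster SSH key with ssh-copy-id instead of editing authorized_keys by hand (run from ceph01, then repeat for ceph03 and the cephosd hosts):
ssh-copy-id -f -i /etc/ceph/ceph.pub root@ceph02
ssh-copy-id -f -i /etc/ceph/ceph.pub root@ceph03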
Adding the other mon and mgr (control plane) nodes:
ceph orch host add <HOSTNAME> <HOST_IP> --labels _admin
ceph orch host add ceph02 192.168.11.22 --labels _admin
ceph orch host add ceph03 192.168.11.23 --labels _admin
ceph orch apply mon --placement="3 label:_admin"
ceph orch apply mgr --placement="3 label:_admin"
~]# ceph orch ls --service-type mon
NAME PORTS RUNNING REFRESHED AGE PLACEMENT
mon 3/3 5m ago 67s count:3;label:_admin
~]# ceph orch ls --service-type mgr
NAME PORTS RUNNING REFRESHED AGE PLACEMENT
mgr 3/3 5m ago 113s count:3;label:_admin
ADD OSD NODES
ceph orch host add cephosd01 192.168.11.31
ceph orch host add cephosd02 192.168.11.32
ceph orch host add cephosd03 192.168.11.33
ceph orch host add cephosd04 192.168.11.34
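Verify that all seven hosts are now known to the orchestrator:
ceph orch host ls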
Check the ceph -s output.
If any mon's clock has not synchronized yet and you see a clock skew warning, restart that daemon, for example:
~]# ceph orch daemon restart mon.ceph02
Add disks to the OSD VMs on the NVMe bus:
~]# ceph orch device ls | grep -i nvme
cephosd01 /dev/nvme0n1 ssd VMware_Virtual_NVMe_Disk_VMware_NVME_0000 5120M Yes 0s ago
ceph orch daemon add osd cephosd01:/dev/nvme0n1
Add larger disks to the OSD VMs on the SATA bus:
~]# ceph orch device ls | grep -i hdd
cephosd01 /dev/sdb hdd ATA_VMware_Virtual_SATA_Hard_Drive_00000000000000000001 10.0G Yes 6s ago
Add the OSD to the cluster:
ceph orch daemon add osd cephosd01:/dev/sdb
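As an alternative to adding disks one by one, cephadm can consume every eligible (empty, unpartitioned) device on all hosts automatically; use this with care, and preferably only in a lab, since it grabs everything it considers available:
ceph orch apply osd --all-available-devices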
If you want to change the class of an OSD for any reason, you can do so with these commands:
ceph osd crush rm-device-class 4 5 6 7
ceph osd crush set-device-class hdd 4 5 6 7
The commands above move OSDs 4, 5, 6 and 7 to the hdd class.
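You can confirm the device classes with:
ceph osd crush class ls
ceph osd tree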
Create proper CRUSH rules for SSD and HDD. Note that this is fine for a test/lab environment, but in a production datacenter you need to define more precise and granular failure-domain types, such as PDU or rack, in the CRUSH map.
ceph osd crush rule create-replicated replicated_hdd default host hdd
ceph osd crush rule create-replicated replicated_ssd default host ssd
For production, proceed like this:
Get the CRUSH map as a binary file:
ceph osd getcrushmap -o crushmap
crushtool -d crushmap -o crushmap.txt
Edit the CRUSH map file with an editor:
nano crushmap.txt
The original crushmap.txt:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class ssd
device 1 osd.1 class ssd
device 2 osd.2 class ssd
device 3 osd.3 class ssd
device 4 osd.4 class hdd
device 5 osd.5 class hdd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root
# buckets
host cephosd04 {
id -3 # do not change unnecessarily
id -2 class ssd # do not change unnecessarily
id -11 class hdd # do not change unnecessarily
# weight 0.01469
alg straw2
hash 0 # rjenkins1
item osd.3 weight 0.00490
item osd.7 weight 0.00980
}
host cephosd01 {
id -5 # do not change unnecessarily
id -4 class ssd # do not change unnecessarily
id -12 class hdd # do not change unnecessarily
# weight 0.01469
alg straw2
hash 0 # rjenkins1
item osd.0 weight 0.00490
item osd.4 weight 0.00980
}
host cephosd02 {
id -7 # do not change unnecessarily
id -6 class ssd # do not change unnecessarily
id -13 class hdd # do not change unnecessarily
# weight 0.01469
alg straw2
hash 0 # rjenkins1
item osd.1 weight 0.00490
item osd.5 weight 0.00980
}
host cephosd03 {
id -9 # do not change unnecessarily
id -8 class ssd # do not change unnecessarily
id -14 class hdd # do not change unnecessarily
# weight 0.01469
alg straw2
hash 0 # rjenkins1
item osd.2 weight 0.00490
item osd.6 weight 0.00980
}
root default {
id -1 # do not change unnecessarily
id -10 class ssd # do not change unnecessarily
id -15 class hdd # do not change unnecessarily
# weight 0.05878
alg straw2
hash 0 # rjenkins1
item cephosd04 weight 0.01469
item cephosd01 weight 0.01469
item cephosd02 weight 0.01469
item cephosd03 weight 0.01469
}
# rules
rule replicated_rule {
id 0
type replicated
step take default
step chooseleaf firstn 0 type host
step emit
}
rule replicated_ssd {
id 1
type replicated
step take default class ssd
step chooseleaf firstn 0 type host
step emit
}
rule replicated_hdd {
id 2
type replicated
step take default class hdd
step chooseleaf firstn 0 type host
step emit
}
# end crush map
The modified map, saved as crushmap-modified.txt, with rack buckets and rack-based rules added:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class ssd
device 1 osd.1 class ssd
device 2 osd.2 class ssd
device 3 osd.3 class ssd
device 4 osd.4 class hdd
device 5 osd.5 class hdd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root
# buckets
host cephosd04 {
id -3
id -2 class ssd
id -11 class hdd
# weight 0.01469
alg straw2
hash 0 # rjenkins1
item osd.3 weight 0.00490
item osd.7 weight 0.00980
}
host cephosd01 {
id -5
id -4 class ssd
id -12 class hdd
# weight 0.01469
alg straw2
hash 0 # rjenkins1
item osd.0 weight 0.00490
item osd.4 weight 0.00980
}
host cephosd02 {
id -7
id -6 class ssd
id -13 class hdd
# weight 0.01469
alg straw2
hash 0 # rjenkins1
item osd.1 weight 0.00490
item osd.5 weight 0.00980
}
host cephosd03 {
id -9
id -8 class ssd
id -14 class hdd
# weight 0.01469
alg straw2
hash 0 # rjenkins1
item osd.2 weight 0.00490
item osd.6 weight 0.00980
}
# ADDED: Define new rack buckets
rack rack1 {
id -16
# weight 0.02938
alg straw2
hash 0 # rjenkins1
item cephosd01 weight 0.01469
item cephosd02 weight 0.01469
}
rack rack2 {
id -17
# weight 0.02938
alg straw2
hash 0 # rjenkins1
item cephosd03 weight 0.01469
item cephosd04 weight 0.01469
}
# MODIFIED: Root bucket now contains racks instead of hosts
root default {
id -1
id -10 class ssd
id -15 class hdd
# weight 0.05876
alg straw2
hash 0 # rjenkins1
item rack1 weight 0.02938
item rack2 weight 0.02938
}
# rules
rule replicated_rule {
id 0
type replicated
step take default
step chooseleaf firstn 0 type host
step emit
}
rule replicated_ssd {
id 1
type replicated
step take default class ssd
step chooseleaf firstn 0 type host
step emit
}
rule replicated_hdd {
id 2
type replicated
step take default class hdd
step chooseleaf firstn 0 type host
step emit
}
# ADDED: New rules with rack as failure domain
rule ssd_rack_rule {
id 3
type replicated
# Start at the root and select the SSD class hierarchy
step take default class ssd
# Select leaves from different racks
step chooseleaf firstn 0 type rack
step emit
}
rule hdd_rack_rule {
id 4
type replicated
# Start at the root and select the HDD class hierarchy
step take default class hdd
# Select leaves from different racks
step chooseleaf firstn 0 type rack
step emit
}
# end crush map
Compile the edited text CRUSH map back to binary and inject it into the cluster CRUSH map:
crushtool -c crushmap-modified.txt -o crushmap-new.bin
ceph osd setcrushmap -i crushmap-new.bin
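Optionally, you can sanity-check a rule in the compiled map with crushtool's test mode before relying on it; for example, for ssd_rack_rule (id 3) with 2 replicas:
crushtool -i crushmap-new.bin --test --rule 3 --num-rep 2 --show-mappings
crushtool -i crushmap-new.bin --test --rule 3 --num-rep 2 --show-statistics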
~]# ceph osd crush rule ls
replicated_rule
replicated_ssd
replicated_hdd
ssd_rack_rule
hdd_rack_rule
Now it's time to create POOLs.
ceph osd pool create ssd_pool 64 64 replicated ssd_rack_rule --autoscale-mode=warn
ceph osd pool set ssd_pool size 2
ceph osd pool create hdd_pool 64 64 replicated hdd_rack_rule --autoscale-mode=warn
ceph osd pool set hdd_pool size 2
ceph osd pool set hdd_pool pg_num 16
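You can check how the PG autoscaler judges the current pg_num values with:
ceph osd pool autoscale-status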
If you need to delete a pool, you must first set the allow-pool-delete flag:
ceph config set mon mon_allow_pool_delete true
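The delete itself requires repeating the pool name and an explicit confirmation flag, for example for a hypothetical test_pool:
ceph osd pool delete test_pool test_pool --yes-i-really-really-mean-it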
ceph osd pool ls detail
pool 1 '.mgr' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 17 flags hashpspool stripe_width 0 pg_num_max 32 pg_num_min 1 application mgr read_balance_score 7.89
pool 4 'ssd_pool' replicated size 2 min_size 1 crush_rule 3 object_hash rjenkins pg_num 64 pgp_num 64 autoscale_mode warn last_change 89 flags hashpspool stripe_width 0 read_balance_score 1.25
pool 5 'hdd_pool' replicated size 2 min_size 1 crush_rule 4 object_hash rjenkins pg_num 64 pgp_num 64 autoscale_mode warn last_change 96 flags hashpspool stripe_width 0 read_balance_score 1.19
MAKE POOLs ready for RBD
rbd pool init hdd_pool
rbd pool init ssd_pool
~]# ceph osd pool application get ssd_pool
{
"rbd": {}
}
Now you can use these pools for OpenStack Cinder or Nova, or even use them directly with QEMU/KVM by mapping an image volume on the host and adding it to the VM by its device path. For OpenStack, refer to its documentation. Manual mapping for use with KVM via virt-manager looks like this:
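A minimal sketch (the image name vm-disk01 is just an example): create an RBD image in the pool, map it on the KVM host, and attach the resulting block device to a VM in virt-manager:
rbd create ssd_pool/vm-disk01 --size 10G
rbd map ssd_pool/vm-disk01
rbd showmapped
rbd map prints the device path (e.g. /dev/rbd0); add that device as a disk to the VM in virt-manager, or point QEMU at it directly.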