ceph部署


1. 部署

1.1 修改host并配置互信(所有节点都需执行)

vim /etc/hosts
10.244.0.228 ubuntu-vm-2404-test-1
10.244.0.155 ubuntu-vm-2404-test-2
10.244.0.226 ubuntu-vm-2404-test-3

1.2 安装 Docker(所有节点都需执行)

sudo apt install docker.io

1.3 SSH 免密登录(所有节点都需执行)注意必须执行下一步的拷贝公钥才算完成免密

sudo su # 必须以root身份
ssh-keygen -t rsa
# 拷贝密钥
ssh-copy-id ubuntu-vm-2404-test-1
ssh-copy-id ubuntu-vm-2404-test-2
ssh-copy-id ubuntu-vm-2404-test-3

1.4 使用 cephadm 安装 ceph 集群(在一台上执行)

# 安装 cephadm
apt install cephadm -y
# 启用集群
cephadm bootstrap --mon-ip 10.244.0.228
# 安装 ceph-cli
apt install ceph-common -y
# 集群状态信息
ceph -s
# 查看节点信息
ceph orch host ls
# 拷贝公钥
ssh-copy-id -f -i /etc/ceph/ceph.pub ubuntu-vm-2404-test-2
ssh-copy-id -f -i /etc/ceph/ceph.pub ubuntu-vm-2404-test-3
# 添加节点
ceph orch host add ubuntu-vm-2404-test-2
ceph orch host add ubuntu-vm-2404-test-3
# 查看节点信息
ceph orch host ls
# 列出可用设备,有延迟,一般不准确
ceph orch device ls
# 添加所有可用磁盘到集群
# 注意,此命令的效果是持久的,会自动添加新的可用的磁盘到集群中。
# https://docs.ceph.com/en/latest/cephadm/services/osd/
# 如果需要取消,可以执行ceph orch apply osd --all-available-devices --unmanaged=true
ceph orch apply osd --all-available-devices
# 添加指定磁盘到集群,磁盘必须未格式化,可以选择某个特定分区
# 不建议使用raw方式,更推荐使用lvm方式:https://docs.ceph.com/en/latest/ceph-volume/intro/ 
# 不会对性能产生明显影响,而且易于管理(易于扩容等)
# bluestore是ceph管理的存储引擎。如果使用raw的话,ceph会在磁盘上做标记为bluestore,但是依赖于lvm方式的不会做标记。
# 无论使用raw还是lvm,都会使用bluestore存储引擎。bluestore替代旧版的filestore引擎,在稳定性与性能上均有提升。
# https://cloud.tencent.com/developer/article/2314578
sudo ceph orch daemon add osd ubuntu-vm-2404-test-2:/dev/sda
sudo ceph orch daemon add osd --method raw ubuntu-vm-2404-test-2:/dev/sda # 不推荐使用
# 查看 osd 状态
ceph osd tree

1.5 初始化(在一台上执行)

# 创建数据pool
ceph osd pool create test
ceph osd pool set test bulk true
# 创建元数据pool
ceph osd pool create cephfs_metadata 128
# 部署mds
ceph orch apply mds test --placement="1 ubuntu-vm-2404-test-1"
# 为数据pool启用cephfs应用
ceph osd pool application enable test cephfs
# 创建文件系统(其中cephfs_name为文件系统名字,cephfs_metadata为元数据pool,test为数据pool)
ceph fs new cephfs_name cephfs_metadata test
# 创建子卷
ceph fs subvolumegroup create cephfs_name csi
# 创建rgw
ceph orch apply rgw default-realm default-zone --placement="3 k10 k11 k12"
ceph orch apply rgw infra_rgw --placement='3 k10 k11 k12' --port=8000
# 创建对象存储
radosgw-admin user create --uid=s3 --display-name="object_storage" --system
# 记住你的access_key和secret_key
"keys": [
        {
            "user": "s3",
            "access_key": "ENL7QVDGNNYNNEX3X3VS",
            "secret_key": "vaUjPhUkR8yLAdqVD6FRnXGVNrxBNDs9bMWFb6Kb",
            "active": true,
            "create_date": "2025-03-10T02:55:13.039290Z"
        }
    ],
# 创建对象桶
radosgw-admin bucket create --bucket=<bucket-name> --user=<username>


# 创建自定义realm的rgw
# 删除默认zone和zonegroup
radosgw-admin zone list
radosgw-admin zone delete  --rgw-zone default
radosgw-admin zonegroup list
radosgw-admin zonegroup delete  --rgw-zonegroup default
# 创建realm、zone和zonegroup
radosgw-admin realm create --rgw-realm=default --default
radosgw-admin zonegroup create --rgw-zonegroup default --rgw-realm default --master --default
radosgw-admin zone create --rgw-zonegroup default --rgw-zone default --master --default
# 设定default(前面default是命令后面是名字)
radosgw-admin realm default default
radosgw-admin zonegroup default default
radosgw-admin zone default default
# 设定master
radosgw-admin zonegroup get > zonegroup.json
vim zonegroup.json # 将其中is_master字段改为true
radosgw-admin zonegroup set --infile zonegroup.json # 导入配置
# 同步period,要执行两次
radosgw-admin period update --commit
radosgw-admin period update --commit
# 部署rgw,必须指定realm和zone
ceph orch apply rgw default --realm=default --zone=default  --placement='3 k10 k11 k12'


# 部署rbd
## rbd只需要创建pool即可,不需要部署daemon
ceph osd pool create rbd_pool 64 64
ceph osd pool application enable rbd_pool rbd
## 创建具体的块设备
rbd create -p rbd_pool --image ceph-rbd-demo.img --size 10G
## 查看当前的块设备列表
rbd -p rbd_pool ls
## 查看详细信息
rbd -p rbd_pool info ceph-rbd-demo.img
## 关闭features
rbd -p rbd_pool --image ceph-rbd-demo.img feature disable deep-flatten
rbd -p rbd_pool --image ceph-rbd-demo.img feature disable fast-diff
rbd -p rbd_pool --image ceph-rbd-demo.img feature disable object-map
rbd -p rbd_pool --image ceph-rbd-demo.img feature disable exclusive-lock
## map到本地
rbd map -p rbd_pool --image ceph-rbd-demo.img
## 卸载设备
rbd device unmap -p rbd_pool --image ceph-rbd-demo.img

1.6 配置密钥(客户端执行)

# 找到某用户密钥
cat /etc/ceph/ceph.client.admin.keyring
# 复制密钥
echo "AQCpx8Zn2nTWMxAAvqX4K3Limi6qYmqh9XKTsw==" > secret

用户创建流程可以参考ceph(二)CephX认证授权、用户管理和keyring - areke - 博客园

ceph auth add client.test mon 'allow *' osd 'allow *' mds 'allow *'
ceph auth list
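
如果不希望客户端直接使用admin密钥,也可以只针对某个文件系统授权一个受限用户,下面是一个示例(client.cephfs-user为假设的用户名,cephfs_name为前文创建的文件系统,细节以官方文档为准):

# 创建仅能读写 cephfs_name 根目录的用户,并保存生成的keyring
ceph fs authorize cephfs_name client.cephfs-user / rw > /etc/ceph/ceph.client.cephfs-user.keyring
# 确认该用户的权限
ceph auth get client.cephfs-user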

1.7 挂载(客户端执行)

sudo mkdir /mnt/cephfs
sudo mount -t ceph ubuntu-vm-2404-test-1:6789:/ /mnt/cephfs -o name=admin,secretfile=secret

# df -h # 实际容量由于三备份会大约为1/3
Filesystem           Size  Used Avail Use% Mounted on
tmpfs                748M  2.2M  746M   1% /run
/dev/vda1             18G  4.8G   13G  28% /
tmpfs                3.7G   16K  3.7G   1% /dev/shm
tmpfs                5.0M     0  5.0M   0% /run/lock
/dev/vda16           881M   61M  758M   8% /boot
/dev/vda15           105M  6.1M   99M   6% /boot/efi
tmpfs                748M   12K  748M   1% /run/user/0
tmpfs                748M   12K  748M   1% /run/user/1000
10.244.0.228:6789:/   18G     0   18G   0% /mnt/cephfs
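
如果希望重启后自动挂载,可以在客户端的/etc/fstab中追加类似下面的一行(仅为示例,mon地址与secret文件路径需按实际环境替换):

10.244.0.228:6789:/  /mnt/cephfs  ceph  name=admin,secretfile=/etc/ceph/admin.secret,_netdev,noatime  0  0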

# sudo ceph -s
  cluster:
    id:     f020f9e9-f8da-11ef-9430-4eecb663651b
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum ubuntu-vm-2404-test-1 (age 4h)
    mgr: ubuntu-vm-2404-test-1.suvfjb(active, since 4h), standbys: ubuntu-vm-2404-test-2.ppaxtx
    mds: 1/1 daemons up
    osd: 6 osds: 6 up (since 59m), 6 in (since 59m)

  data:
    volumes: 1/1 healthy
    pools:   3 pools, 145 pgs
    objects: 24 objects, 585 KiB
    usage:   461 MiB used, 56 GiB / 57 GiB avail
    pgs:     145 active+clean

1.8 客户端

# 先安装客户端
sudo apt install ceph-common
# 复制密钥和配置,以下两条在服务器端执行
sudo scp /etc/ceph/ceph.conf ethereal@10.244.0.118:/home/ethereal/Downloads/ceph
sudo scp /etc/ceph/ceph.client.admin.keyring ethereal@10.244.0.118:/home/ethereal/Downloads/ceph
# 以下客户端执行
sudo cp ceph.conf /etc/ceph/
sudo cp ceph.client.admin.keyring /etc/ceph
echo "AQCpx8Zn2nTWMxAAvqX4K3Limi6qYmqh9XKTsw==" > secret # 密钥来自于ceph.client.admin.keyring
# 客户端挂载
sudo mount -t ceph :/ /mnt/cephfs -o name=admin # 优先挂载v2版本,推荐v2方式,在速度与安全性方面都有提升,https://www.bookstack.cn/read/ceph-en/de5b43971cfd01ae.md#msgr2-protocol

apt install s3cmd
s3cmd --configure
# 1. 使用终端完成配置
# Access Key:刚才创建的radosgw user的access_key
# Secret Key:刚才创建的radosgw user的secret_key
# Default Region:默认直接回车,使用US
# S3 Endpoint:IP地址:port,例如“192.168.64.128:80”
# DNS-style bucket+hostname:“bootstrap_host_ip:80/%(bucket)s”,如"192.168.64.128:80/%(bucket)s"
# Encryption password:默认直接回车,不需要密码
# Path to GPG program [/usr/bin/gpg]:默认直接回车
# Use HTTPS protocol [No]: no,不使用HTTPS
# HTTP Proxy server name: 默认直接回车
# Test access with supplied credentials? [Y/n] 默认直接回车
# 2. 最后保存设置,会生成/root/.s3cfg文件
# 3. 修改刚生成的/root/.s3cfg中的三处配置
# cloudfront_host = [serverIP](改成自己的服务端的IP)
# host_base = [serverIP]:[Port](改成自己的服务端的IP和端口)
# host_bucket = [serverIP]:[Port]/%(bucket)(改成自己的服务端的IP和端口)
s3cmd ls
s3cmd mb s3://default-bucket # 创建bucket
s3cmd mb s3://default-bucket -v # debug模式
s3cmd put values.yaml s3://default-bucket/values.yaml # 上传文件


# rbd使用
## map到本地,其中keyfile内容只有key,例如AQDgb+pnZNLsNhAA2J83AIzrzFDB1AlYGjCoAQ==
rbd map -p rbd_pool --image ceph-rbd-demo.img
rbd --id admin -m 10.144.96.10:3300,10.144.96.11:3300,10.144.96.12:3300 --keyfile=***stripped*** map rbd_pool/ceph-rbd-demo.img --device-type krbd --options noudev
## 卸载设备
rbd device unmap -p rbd_pool --image ceph-rbd-demo.img

1.9 设置时间

# (服务端,所有节点)
# 启用时间同步
timedatectl set-ntp true
# 设置时区 Asia/Shanghai
timedatectl set-timezone Asia/Shanghai
# 查看状态
timedatectl status


# (服务端,单个节点)
ceph config set mon mon_clock_drift_allowed 0.5
ceph config set mon mon_clock_drift_warn_backoff 10

1.10 下线

# 下线OSD
# 删除守护进程
ceph orch daemon rm osd.0 --force
# 删除crush图节点
ceph osd crush remove osd.0
# 剔除osd
ceph osd down osd.0
ceph osd out osd.0
ceph auth del osd.0
# 删除osd
ceph osd rm osd.0
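
# 在cephadm集群上也可以直接用orchestrator方式下线OSD(示例;--zap会同时擦除磁盘,旧版本可能不支持该参数)
ceph orch osd rm 0 --zap --force
# 查看下线进度
ceph orch osd rm status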

# 擦除数据,必须擦除数据后才可重新加入集群
wipefs -af /dev/sdb
ceph orch device zap k10 /dev/sdb --force


# 下线pool
# 删除所有相关mds
ceph orch rm mds.infra-meta
# 标记文件系统为fail
ceph fs fail cephfs-infra
# 查看当前fs状态
ceph fs status
# 删除pool
ceph config set mon mon_allow_pool_delete true
ceph osd pool rm infra-meta infra-meta --yes-i-really-really-mean-it

删除fs

ceph fs fail test_cephfs
ceph fs rm test_cephfs --yes-i-really-mean-it
ceph osd pool application disable cephfs_data_pool cephfs --yes-i-really-mean-it
ceph config set mon mon_allow_pool_delete true
ceph osd pool rm cephfs_data_pool cephfs_data_pool --yes-i-really-really-mean-it
ceph osd pool rm cephfs_meta_pool cephfs_meta_pool --yes-i-really-really-mean-it

1.11 分层缓存

缓存模式可以参考ceph 缓存分层 - 知乎

# 创建一个缓存层
ceph osd tier add cold-storage hot-storage
# 设置缓存模式
ceph osd tier cache-mode hot-storage writeback
# 将客户端的流量从存储池重定向到缓存池
ceph osd tier set-overlay cold-storage hot-storage

ceph osd pool set {cachepool} hit_set_type bloom
ceph osd pool set {cachepool} hit_set_count 1
ceph osd pool set {cachepool} hit_set_period 300 # 300s 后触发hitset
ceph osd pool set {cachepool} target_max_bytes 1000000000 # 1G
ceph osd pool set {cachepool} target_max_objects 100 # 100个objects后触发下刷
ceph osd pool set {cachepool} cache_min_flush_age 300 # 300s 后触发下刷
ceph osd pool set {cachepool} cache_min_evict_age 300 # 300s 后触发驱逐
ceph osd pool set {cachepool} cache_target_dirty_ratio 0.01
ceph osd pool set {cachepool} cache_target_full_ratio 0.02
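
如需撤销缓存层,大致流程如下(示例;cache-mode名称在不同版本可能为proxy或forward,请以对应版本文档为准):

# 停止新数据进入缓存池
ceph osd tier cache-mode hot-storage proxy
# 将缓存池中的脏对象刷回后端池并驱逐
rados -p hot-storage cache-flush-evict-all
# 移除overlay并解除分层关系
ceph osd tier remove-overlay cold-storage
ceph osd tier remove cold-storage hot-storage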

1.12 划分OSD

# 导出原本的osd map
ceph osd getcrushmap -o ./tmp/crushmap.ori
# 反编译osd map
crushtool -d crushmap.ori -o decrushmap.ori
# 定义bucket
root hdd {
id -21 # do not change unnecessarily
id -22 class hdd # do not change unnecessarily
# weight 1.935
alg straw2
hash 0 # rjenkins1
item osd.0 weight 0.488
item osd.1 weight 0.488
item osd.2 weight 0.488
}

# 桶层次:type 0 osd,type 1 host,type 2 chassis,type 3 rack,type 4 row,type 5 pdu,type 6 pod,type 7 room,type 8 datacenter,type 9 region,type 10 root

# 修改规则
rule ssd{
id 1
type replicated
min_size 1
max_size 10
step take ssd
step chooseleaf firstn 0 type osd
step emit
}
rule hdd{
id 2
type replicated
min_size 1
max_size 10
step take hdd
step chooseleaf firstn 0 type osd # 这里是说从规则中选取osd
step emit
}
# 编译osd map
crushtool -c decrushmap.new -o crushmap.new
# 导入map
ceph osd setcrushmap -i ./crushmap.new
# 设定某个存储池的规则
ceph osd pool set ssd_pool crush_rule ssd
# 修改ceph.conf防止回滚,在global中加入如下字段
osd_crush_update_on_start=false


# pg手动分配
# 查看当前容量
ceph osd df
# 查看某个osd的占用
ceph pg ls-by-osd osd.3 | egrep ^1
# 移动pg,从3到2
ceph osd pg-upmap-items 1.77 3 2
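
除了手工编辑CRUSH map,也可以直接基于设备类(device class)创建规则,通常更简单,示例如下(规则名为假设,root为default,故障域为host):

# 查看当前的设备类
ceph osd crush class ls
# 分别为hdd和ssd设备类创建副本规则
ceph osd crush rule create-replicated hdd_rule default host hdd
ceph osd crush rule create-replicated ssd_rule default host ssd
# 将存储池绑定到对应规则
ceph osd pool set ssd_pool crush_rule ssd_rule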

1.13 崩溃恢复

Ceph集群显示XXX daemons have recently crashed警告-CSDN博客

# 列出崩溃信息
ceph crash ls-new
# 归档新的崩溃记录
ceph crash archive-all
# 集群信息存储位置,在所有节点上都存在
cd /var/lib/ceph/<cluster-id>

镜像008 Ceph集群数据同步 - 梦中泪 - 博客园

备份如何备份和恢复Ceph集群的配置和数据?请分别提供备份和恢复的步骤。 | 壹梵在线网络服务 一凡在线

备份集群配置文件:

# 备份Ceph配置文件
cp /etc/ceph/* {备份目录}
备份MON(监控节点)的数据:

# 备份MON的数据
ceph mon dump --cluster {集群名} --format json > {备份目录}/mon_dump.json
备份OSD(对象存储守护进程)的数据:

# 备份OSD的数据
ceph osd dump --cluster {集群名} --format json > {备份目录}/osd_dump.json
备份RGW(对象网关)的数据(如果有):

# 备份RGW的数据
radosgw-admin --cluster {集群名} backup export --file {备份目录}/rgw_backup.bin
备份MDS(元数据服务器)的数据(如果有):

# 备份MDS的数据
ceph fs dump --cluster {集群名} --format json > {备份目录}/fs_dump.json
ceph mds getmap -o {备份目录}/mdsmap.bin


恢复集群配置文件:

# 恢复Ceph配置文件
cp {备份目录}/* /etc/ceph/
如果之前的集群已被清空或者不可用,可以重新初始化集群:

# 重新初始化Ceph集群
ceph-deploy new {MON节点,多个节点以逗号分隔}
ceph-deploy install {MON节点,多个节点以逗号分隔}
ceph-deploy mon create-initial
恢复MON的数据:

# 恢复MON的数据
ceph-mon --cluster {集群名} --mkfs -i {MON节点} --keyring /etc/ceph/{集群名}.mon.{MON节点}.keyring
ceph-mon --cluster {集群名} -i {MON节点}
恢复OSD的数据:

# 恢复OSD的数据
ceph-osd --cluster {集群名} --mkfs -i {OSD节点} --osd-data /var/lib/ceph/osd/{集群名}-{OSD节点}
ceph-osd --cluster {集群名} -i {OSD节点}
恢复RGW的数据(如果有):

# 恢复RGW的数据
radosgw-admin --cluster {集群名} backup import --file {备份目录}/rgw_backup.bin
恢复MDS的数据(如果有):

# 恢复MDS的数据
ceph-mds --cluster {集群名} --mkfs -i {MDS节点} --keyring /etc/ceph/{集群名}.mds.{MDS节点}.keyring
ceph-mds --cluster {集群名} -i {MDS节点}
ceph fs new {文件系统名称} {MDS节点1} {MDS节点2}
ceph osd pool create cephfs_metadata 8
ceph osd pool create cephfs_data 8

恢复速度设置

https://docs.ceph.com/en/latest/rados/configuration/osd-config-ref/#recovery

ceph数据recovery配置策略(数据recovery流量控制) - 钟桂耀 - 博客园

# 业务优先
ceph tell osd.* injectargs '--osd-max-backfills 1 --osd-recovery-max-active 1 --osd-recovery-max-single-start 1'
ceph tell osd.* injectargs '--osd-recovery-sleep 1'

# 恢复优先
ceph tell osd.* injectargs '--osd-max-backfills 5 --osd-recovery-max-active 5 --osd-recovery-max-single-start 5'
ceph tell osd.* injectargs '--osd-recovery-sleep 0'

# 以上为临时设置,永久设置需要修改config
ceph config set osd osd_recovery_max_active 10
ceph config set osd osd_max_backfills 10
ceph config set osd osd_recovery_max_single_start 1

# 需要注意,新的osd加入时,会采用默认设置
ceph config get osd osd_recovery_max_active

# 查看当前运行时配置
ceph tell osd.0 config show | grep recovery


osd_max_backfills : 一个osd上最多能有多少个pg同时做backfill。其中osd出去的最大backfill数量为osd_max_backfills ,osd进来的最大backfill数量也是osd_max_backfills ,所以每个osd最大的backfill数量为osd_max_backfills * 2;
osd_recovery_sleep: 出队列后先Sleep一段时间,拉长两个Recovery的时间间隔;
osd_recovery_max_active: 每个OSD上同时进行的所有PG的恢复操作(active recovery)的最大数量;(注意是恢复操作,不是恢复PG数,因此会受到下面参数的影响)
osd_recovery_max_single_start: OSD在某个时刻会为一个PG启动恢复操作数;


osd_max_backfills:默认值10. 一个osd上承载了多个pg。可能很多pg都需要做第二种recovery,即backfill。 设定这个参数来指明在一个osd上最多能有多少个pg同时做backfill。
osd_recovery_max_active:默认值15. 一个osd上可以承载多个pg, 可能好几个pg都需要recovery,这个值限定该osd最多同时有多少pg做recovery。
osd_recovery_max_single_start:默认值5. 这个值限定了每个pg可以启动recovery操作的最大数。
osd_recovery_max_chunk: 默认值8388608. 设置恢复数据块的大小,以防网络阻塞
osd_recovery_op_priority: 默认值10. osd修复操作的优先级, 可小于该值
osd_recovery_sleep: 默认值0. recovery的间隔

模拟坏盘

# 查看当前所在位置
ll /sys/block/sdc # 输出包含host0
# 模拟删除
echo 1 > /sys/block/sdc/device/delete
# 如果集群有写入,对应的 OSD 就很快 down 掉了


# 恢复
echo '- - -' > /sys/class/scsi_host/host0/scan
# 磁盘编号会改变,因此必须删除数据后重新加入osd(参考上面下线过程)


# 查看pg状态
ceph pg dump | grep recover

1.14 修改rgw的存储池

RGW池放置和存储类(Octopus版本) - Varden - 博客园

# 获取zonegroup
radosgw-admin zonegroup get
# 获取zone
radosgw-admin zone get
# 向区域组default中添加placement temporary
radosgw-admin zonegroup placement add \
--rgw-zonegroup default \
--placement-id temporary
# 向区域中添加placement细节,引用它所属区域组中的placement temporary
radosgw-admin zone placement add \
--rgw-zone default \
--placement-id temporary \
--data-pool default.rgw.temporary.data \
--index-pool default.rgw.temporary.index \
--data-extra-pool default.rgw.temporary.non-ec
# 向区域组中default-placement的placement添加storage-class
radosgw-admin zonegroup placement add \
--rgw-zonegroup default \
--placement-id default-placement \
--storage-class COLD
# 向区域中添加storageclass细节,指定data-pool
radosgw-admin zone placement add \
--rgw-zone default \
--placement-id default-placement \
--storage-class COLD \
--data-pool default.rgw.cold.data \
--compression lz4
# 设定区域组 default默认放置目标
radosgw-admin zonegroup placement default \
--rgw-zonegroup default \
--placement-id new-placement


# 创建bucket时指定placement rule
# 利用--bucket-location覆盖用户的default_placement
s3cmd mb s3://second --bucket-location=":default-placement"

1.15 升级

Ceph Releases (index) — Ceph Documentation

使用 Cephadm 升级 CEPH - Varden - 博客园
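
cephadm升级的基本流程大致如下(示例;版本号需替换为目标版本,升级前请确认集群为HEALTH_OK):

# 查看当前版本与集群状态
ceph version
ceph -s
# 开始升级到指定版本(也可以用 --image 指定镜像)
ceph orch upgrade start --ceph-version 19.2.1
# 查看升级进度与日志
ceph orch upgrade status
ceph -W cephadm
# 如有问题可暂停或停止升级
ceph orch upgrade pause
ceph orch upgrade stop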

1.16 故障处理

2. 常见 OSD 故障处理 · Ceph 运维手册

apt remove ceph-osd
systemctl restart ceph.target
ceph orch daemon restart osd.5
# docker中磁盘映射关系:
docker inspect -f "{{.Mounts}}" 369a1376f78c
[{bind /sys /sys true rprivate} {bind /run/lock/lvm /run/lock/lvm true rprivate} {bind /var/log/ceph/e30eab96-fa62-11ef-8818-246e96a3ad74 /var/log/ceph z true rprivate} {bind /run/udev /run/udev true rprivate} {bind /dev /dev true rprivate} {bind /run/lvm /run/lvm true rprivate} {bind / /rootfs true rslave} {bind /var/lib/ceph/e30eab96-fa62-11ef-8818-246e96a3ad74/osd.0 /var/lib/ceph/osd/ceph-0 z true rprivate} {bind /var/lib/ceph/e30eab96-fa62-11ef-8818-246e96a3ad74/osd.0/config /etc/ceph/ceph.conf z true rprivate} {bind /var/run/ceph/e30eab96-fa62-11ef-8818-246e96a3ad74 /var/run/ceph z true rprivate} {bind /var/lib/ceph/e30eab96-fa62-11ef-8818-246e96a3ad74/crash /var/lib/ceph/crash z true rprivate}]

1.17 配额

# 设置用户配额
radosgw-admin quota set --quota-scope=user --uid=uat --max-objects=10 --max-size=1024

# 设置bucket配额
radosgw-admin quota set --uid=uat --quota-scope=bucket --max-objects=10 --max-size=1024

# 启用/禁用用户配额
radosgw-admin quota enable --quota-scope=user --uid=uat
radosgw-admin quota disable --quota-scope=user --uid=uat

# 启用/禁用bucket配额
radosgw-admin quota enable --quota-scope=bucket --uid=uat
radosgw-admin quota disable --quota-scope=bucket --uid=uat

# 获取配额信息
radosgw-admin user info --uid=uat

# 获取存储池配额
ceph osd pool get-quota test_map

# 设置存储池配额
ceph osd pool set-quota <poolname> max_bytes size

1.18 测试

写:必须先执行写并且添加--no-cleanup才可以执行读
rados bench -p rbd 10 write --no-cleanup

顺序读:
rados bench -p rbd 10 seq

随机读:
rados bench -p rbd 10 rand

删除rados bench命令创建的数据:
rados -p rbd cleanup

查看磁盘io:
`iotop -P`


cur 是current的缩写
cur MB/s 当前速度
avg MB/s 平均速度
Bandwidth (MB/sec): 吞吐量
Average IOPS: 平均iops
Stddev IOPS: 标准偏差
Average Latency(s): 平均延迟
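
除了rados bench,也可以用rbd bench直接对块设备镜像进行测试,示例如下(镜像沿用前文创建的ceph-rbd-demo.img,参数可按需调整):

# 4M顺序写,总共写入1G
rbd bench --io-type write --io-size 4M --io-threads 16 --io-total 1G --io-pattern seq rbd_pool/ceph-rbd-demo.img
# 随机读
rbd bench --io-type read --io-pattern rand --io-total 1G rbd_pool/ceph-rbd-demo.img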

测试结果:

# hdd write
Total time run: 10.3334
Total writes made: 559
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 216.387
Stddev Bandwidth: 10.8403
Max bandwidth (MB/sec): 232
Min bandwidth (MB/sec): 204
Average IOPS: 54
Stddev IOPS: 2.71006
Max IOPS: 58
Min IOPS: 51
Average Latency(s): 0.294018
Stddev Latency(s): 0.194231
Max latency(s): 0.892445
Min latency(s): 0.024855


# hdd read
Total time run: 2.664
Total reads made: 559
Read size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 839.338
Average IOPS: 209
Stddev IOPS: 73.5391
Max IOPS: 283
Min IOPS: 179
Average Latency(s): 0.0730044
Max latency(s): 0.495011
Min latency(s): 0.00412251

# ssd write
Total time run: 10.0774
Total writes made: 1787
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 709.311
Stddev Bandwidth: 60.0577
Max bandwidth (MB/sec): 784
Min bandwidth (MB/sec): 608
Average IOPS: 177
Stddev IOPS: 15.0144
Max IOPS: 196
Min IOPS: 152
Average Latency(s): 0.0899146
Stddev Latency(s): 0.0832786
Max latency(s): 1.12338
Min latency(s): 0.0246071


# ssd read
Total time run: 5.86502
Total reads made: 1787
Read size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 1218.75
Average IOPS: 304
Stddev IOPS: 27.335
Max IOPS: 312
Min IOPS: 244
Average Latency(s): 0.0513641
Max latency(s): 0.241155
Min latency(s): 0.00696541


# cache write
Total time run: 10.0538
Total writes made: 1939
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 771.446
Stddev Bandwidth: 61.3638
Max bandwidth (MB/sec): 836
Min bandwidth (MB/sec): 660
Average IOPS: 192
Stddev IOPS: 15.3409
Max IOPS: 209
Min IOPS: 165
Average Latency(s): 0.0827728
Stddev Latency(s): 0.0337953
Max latency(s): 0.290505
Min latency(s): 0.0306568


# cache read
Total time run: 6.85431
Total reads made: 1939
Read size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 1131.55
Average IOPS: 282
Stddev IOPS: 40.6596
Max IOPS: 349
Min IOPS: 228
Average Latency(s): 0.0553511
Max latency(s): 0.252595
Min latency(s): 0.00646825

1.19 修改监控

查看当前地址

# ceph config get mgr
WHO     MASK  LEVEL     OPTION                                VALUE                                                     RO
mgr           advanced  container_image                       quay.io/ceph/ceph@sha256:41d3f5e46ff7de28544cc8869fdea13fca824dcef83936cb3288ed9de935e4de  *
mgr           advanced  mgr/cephadm/container_init            True                                                      *
mgr           advanced  mgr/cephadm/migration_current         7                                                         *
mgr           advanced  mgr/dashboard/ALERTMANAGER_API_HOST   http://cluster.svc:9093                                   *
mgr           advanced  mgr/dashboard/GRAFANA_API_SSL_VERIFY  false                                                     *
mgr           advanced  mgr/dashboard/GRAFANA_API_URL         https://cluster.svc:3000
mgr           advanced  mgr/dashboard/PROMETHEUS_API_HOST     http://cluster.svc:9095                                   *
mgr           advanced  mgr/dashboard/RGW_API_ACCESS_KEY      {"default": "HKCWLKQKXS1G1L2BV7A4"}                       *
mgr           advanced  mgr/dashboard/RGW_API_SECRET_KEY      {"default": "qGCwWDjirgdCLzoIOhdtJvMnWiPqOWwIaGoZofSw"}   *
mgr           advanced  mgr/dashboard/ssl_server_port         8443                                                      *
global        basic     mgr/orchestrator/orchestrator         cephadm

修改地址

ceph dashboard set-alertmanager-api-host https://cluster.svc:9093
ceph dashboard set-grafana-api-url https://cluster.svc:3000
ceph dashboard set-prometheus-api-host https://cluster.svc:9095
# 或者
ceph config set mgr mgr/dashboard/ALERTMANAGER_API_HOST https://cluster.svc:9093
ceph config set mgr mgr/dashboard/GRAFANA_API_URL https://cluster.svc:3000
ceph config set mgr mgr/dashboard/PROMETHEUS_API_HOST https://cluster.svc:9095

2. 部署到k8s

2.0 准备工作(服务端执行)

# 确保ceph正常运行
ceph -s
# 确保存在mon
ceph mon dump
# 获取key
ceph auth get client.admin
# 创建data池
ceph osd pool create test 8 8
# 创建元数据池
ceph osd pool create cephfs_metadata 8 8
# 关联元数据与data
ceph fs new cephfs cephfs_metadata test
# 创建子卷
ceph fs subvolumegroup create cephfs csi
# volume代表一个文件系统卷,subvolume可以理解成volume下的文件夹
# volumegroup与subvolumegroup可以对volume或subvolume进行方便的权限管理
# https://elrond.wang/2021/08/16/CephFS-subvolume/

子卷调整配额

管理CephFS:创建、删除及操作子卷、快照和子卷组-CSDN博客

4.3. Ceph 文件系统子卷 | Red Hat Product Documentation
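
子卷配额可以在创建时指定,也可以事后调整,示例如下(子卷名test-subvol为假设,大小单位为字节):

# 创建带配额的子卷(约1GiB)
ceph fs subvolume create cephfs test-subvol --size 1073741824 --group_name csi
# 调整子卷配额到约2GiB
ceph fs subvolume resize cephfs test-subvol 2147483648 --group_name csi
# 查看子卷信息(含配额与已用空间)
ceph fs subvolume info cephfs test-subvol --group_name csi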

2.1 下载csi

git clone git@github.com:ceph/ceph-csi.git --depth=1
cd ceph-csi/deploy/cephfs/kubernetes

2.2 修改配置文件

2.2.1 修改conf

其中,clusterID是集群ID,可以通过在服务器端命令ceph -s获得,mon信息可以通过命令ceph mon dump获得

# cat csi-config-map.yaml
---
apiVersion: v1
kind: ConfigMap
data:
  config.json: |-
    [
      {
        "clusterID": "c7b4xxf7-c61e-4668-9xx0-82c9xx5e3696",
        "monitors": [
          "xxx.xxx.xxx.xxx:3300", # v2方式
          "xxx.xxx.xxx.xxx:6789"  # v1方式
        ]
      }
    ]
metadata:
  name: ceph-csi-config

# sudo ceph -s
  cluster:
    id:     f020f9e9-f8da-11ef-9430-4eecb663651b
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum ubuntu-vm-2404-test-1 (age 22h)
    mgr: ubuntu-vm-2404-test-1.suvfjb(active, since 22h), standbys: ubuntu-vm-2404-test-2.ppaxtx
    mds: 1/1 daemons up
    osd: 6 osds: 6 up (since 19h), 6 in (since 19h)

  data:
    volumes: 1/1 healthy
    pools:   3 pools, 145 pgs
    objects: 278 objects, 1001 MiB
    usage:   3.4 GiB used, 53 GiB / 57 GiB avail
    pgs:     145 active+clean

# sudo ceph mon dump
epoch 1
fsid f020f9e9-f8da-11ef-9430-4eecb663651b
last_changed 2025-03-04T09:28:10.173718+0000
created 2025-03-04T09:28:10.173718+0000
min_mon_release 19 (squid)
election_strategy: 1
0: [v2:10.244.0.228:3300/0,v1:10.244.0.228:6789/0] mon.ubuntu-vm-2404-test-1
dumped monmap epoch 1

2.2.2 修改secret

其中,userKey和adminKey都可通过命令ceph auth get client.admin获得

# cat secret.yaml
---
apiVersion: v1
kind: Secret
metadata:
  name: csi-cephfs-secret
  namespace: default
stringData:
  # Required for statically provisioned volumes
  userID: admin
  userKey: AQBg4llf+9CAGdsAds4tQzS+0O7dscB5ZTiTEQ==

  # Required for dynamically provisioned volumes
  adminID: admin
  adminKey: AQBg4llf+9CAGdsAds4tQzS+0O7dscB5ZTiTEQ==

# sudo ceph auth get client.admin
[client.admin]
key = AQCpx8Zn2nTWMxAAvqX4K3Limi6qYmqh9XKTsw==
caps mds = "allow *"
caps mgr = "allow *"
caps mon = "allow *"
caps osd = "allow *"

如果是rbd,那么类似

apiVersion: v1
kind: Secret
metadata:
  name: csi-rbd-secret
stringData:
  # Required for statically provisioned volumes
  userID: admin
  userKey: AQDDv9pnTzbFBhAAPal5qxNBNq3KFMRXbaWvMg==

  # Required for dynamically provisioned volumes
  adminID: admin
  adminKey: AQDDv9pnTzbFBhAAPal5qxNBNq3KFMRXbaWvMg==

2.2.3 创建剩余的配置文件

vim csi-config-map-kms.yaml
---
apiVersion: v1
kind: ConfigMap
data:
  config.json: |-
    {}
metadata:
  name: ceph-csi-encryption-kms-config

2.2.4 创建sc

其中,fsName是文件系统名称,pool要对应到一个数据类型的池中,clusterID与前面配置一致,mountOptions要去掉,不然pod会挂载不上,reclaimPolicy表示当pod删除后对应的文件是否要删除。添加kernelMountOptions: ms_mode=prefer-crc后即可启用v2连接。

用法详见ceph-csi/examples/cephfs/storageclass.yaml at devel · ceph/ceph-csi · GitHub

# cat storageclass.yaml
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: csi-cephfs-sc
provisioner: cephfs.csi.ceph.com
parameters:
  clusterID: c7b43ef7-c61e-4668-9970-82c9775e3696
  fsName: cephfs
  pool: test
  kernelMountOptions: ms_mode=prefer-crc
  csi.storage.k8s.io/provisioner-secret-name: csi-cephfs-secret
  csi.storage.k8s.io/provisioner-secret-namespace: default
  csi.storage.k8s.io/controller-expand-secret-name: csi-cephfs-secret
  csi.storage.k8s.io/controller-expand-secret-namespace: default
  csi.storage.k8s.io/node-stage-secret-name: csi-cephfs-secret
  csi.storage.k8s.io/node-stage-secret-namespace: default
reclaimPolicy: Delete
allowVolumeExpansion: true
# mountOptions:
#   - discard

rbd创建的sc如下:

---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: csi-rbd-sc
provisioner: rbd.csi.ceph.com
# If topology based provisioning is desired, delayed provisioning of
# PV is required and is enabled using the following attribute
# For further information read TODO<doc>
# volumeBindingMode: WaitForFirstConsumer
parameters:
# (required) String representing a Ceph cluster to provision storage from.
# Should be unique across all Ceph clusters in use for provisioning,
# cannot be greater than 36 bytes in length, and should remain immutable for
# the lifetime of the StorageClass in use.
# Ensure to create an entry in the configmap named ceph-csi-config, based on
# csi-config-map-sample.yaml, to accompany the string chosen to
# represent the Ceph cluster in clusterID below
clusterID: 06be027c-04c2-11f0-ace2-246e96a3ad74

# (optional) If you want to use erasure coded pool with RBD, you need to
# create two pools. one erasure coded and one replicated.
# You need to specify the replicated pool here in the `pool` parameter, it is
# used for the metadata of the images.
# The erasure coded pool must be set as the `dataPool` parameter below.
# dataPool: <ec-data-pool>

# (required) Ceph pool into which the RBD image shall be created
# (optional) If the topologyConstrainedPools is provided
# eg: pool: rbdpool
pool: rbd_pool

# (optional) RBD image features, CSI creates image with image-format 2 CSI
# RBD currently supports `layering`, `journaling`, `exclusive-lock`,
# `object-map`, `fast-diff`, `deep-flatten` features.
# Refer https://docs.ceph.com/en/latest/rbd/rbd-config-ref/#image-features
# for image feature dependencies.
# imageFeatures: layering,journaling,exclusive-lock,object-map,fast-diff
imageFeatures: "layering"

# (optional) Options to pass to the `mkfs` command while creating the
# filesystem on the RBD device. Check the man-page for the `mkfs` command
# for the filesystem for more details. When `mkfsOptions` is set here, the
# defaults will not be used, consider including them in this parameter.
#
# The default options depend on the csi.storage.k8s.io/fstype setting:
# - ext4: "-m0 -Enodiscard,lazy_itable_init=1,lazy_journal_init=1"
# - xfs: "-K"
#
# mkfsOptions: "-m0 -Ediscard -i1024"

# (optional) Specifies whether to try other mounters in case if the current
# mounter fails to mount the rbd image for any reason. True means fallback
# to next mounter, default is set to false.
# Note: tryOtherMounters is currently useful to fallback from krbd to rbd-nbd
# in case if any of the specified imageFeatures is not supported by krbd
# driver on node scheduled for application pod launch, but in the future this
# should work with any mounter type.
# tryOtherMounters: false

# (optional) mapOptions is a comma-separated list of map options.
# For krbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd/#kernel-rbd-krbd-options
# For nbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd-nbd/#options
# Format:
# mapOptions: "<mounter>:op1,op2;<mounter>:op1,op2"
# An empty mounter field is treated as krbd type for compatibility.
# eg:
# mapOptions: "krbd:lock_on_read,queue_depth=1024;nbd:try-netlink"

# (optional) unmapOptions is a comma-separated list of unmap options.
# For krbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd/#kernel-rbd-krbd-options
# For nbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd-nbd/#options
# Format:
# unmapOptions: "<mounter>:op1,op2;<mounter>:op1,op2"
# An empty mounter field is treated as krbd type for compatibility.
# eg:
# unmapOptions: "krbd:force;nbd:force"

# The secrets have to contain Ceph credentials with required access
# to the 'pool'.
csi.storage.k8s.io/provisioner-secret-name: csi-rbd-secret
csi.storage.k8s.io/provisioner-secret-namespace: ceph-storage
csi.storage.k8s.io/controller-expand-secret-name: csi-rbd-secret
csi.storage.k8s.io/controller-expand-secret-namespace: ceph-storage
csi.storage.k8s.io/node-stage-secret-name: csi-rbd-secret
csi.storage.k8s.io/node-stage-secret-namespace: ceph-storage

# (optional) Specify the filesystem type of the volume. If not specified,
# csi-provisioner will set default as `ext4`.
csi.storage.k8s.io/fstype: ext4

# (optional) uncomment the following to use rbd-nbd as mounter
# on supported nodes
# mounter: rbd-nbd

# (optional) ceph client log location, eg: rbd-nbd
# By default host-path /var/log/ceph of node is bind-mounted into
# csi-rbdplugin pod at /var/log/ceph mount path. This is to configure
# target bindmount path used inside container for ceph clients logging.
# See docs/design/proposals/rbd-nbd.md for available configuration options.
# cephLogDir: /var/log/ceph

# (optional) ceph client log strategy
# By default, log file belonging to a particular volume will be deleted
# on unmap, but you can choose to just compress instead of deleting it
# or even preserve the log file in text format as it is.
# Available options `remove` or `compress` or `preserve`
# cephLogStrategy: remove

# (optional) Prefix to use for naming RBD images.
# If omitted, defaults to "csi-vol-".
# volumeNamePrefix: "foo-bar-"

# (optional) Instruct the plugin it has to encrypt the volume
# By default it is disabled. Valid values are "true" or "false".
# A string is expected here, i.e. "true", not true.
# encrypted: "true"

# (optional) Select the encryption type when encrypted: "true" above.
# Valid values are:
# "file": Enable file encryption on the mounted filesystem
# "block": Encrypt RBD block device
# When unspecified assume type "block". "file" and "block" are
# mutually exclusive.
# encryptionType: "block"

# (optional) Use external key management system for encryption passphrases by
# specifying a unique ID matching KMS ConfigMap. The ID is only used for
# correlation to configmap entry.
# encryptionKMSID: <kms-config-id>

# Add topology constrained pools configuration, if topology based pools
# are setup, and topology constrained provisioning is required.
# For further information read TODO<doc>
# topologyConstrainedPools: |
# [{"poolName":"pool0",
# "dataPool":"ec-pool0" # optional, erasure-coded pool for data
# "domainSegments":[
# {"domainLabel":"region","value":"east"},
# {"domainLabel":"zone","value":"zone1"}]},
# {"poolName":"pool1",
# "dataPool":"ec-pool1" # optional, erasure-coded pool for data
# "domainSegments":[
# {"domainLabel":"region","value":"east"},
# {"domainLabel":"zone","value":"zone2"}]},
# {"poolName":"pool2",
# "dataPool":"ec-pool2" # optional, erasure-coded pool for data
# "domainSegments":[
# {"domainLabel":"region","value":"west"},
# {"domainLabel":"zone","value":"zone1"}]}
# ]

# Image striping, Refer https://docs.ceph.com/en/latest/man/8/rbd/#striping
# For more details
# (optional) stripe unit in bytes.
# stripeUnit: <>
# (optional) objects to stripe over before looping.
# stripeCount: <>
# (optional) The object size in bytes.
# objectSize: <>

# rbd volume QoS.
# QoS provides settings for rbd volume read/write iops
# and read/write bandwidth. There are 4 base qos parameters
# among them, when users apply for a volume capacity equal
# to or less than BaseVolSizebytes, use base qos limit.
# For the portion of capacity exceeding BaseVolSizebytes,
# QoS will be increased in steps set per GiB. If the step
# size parameter per GiB is not provided, only base QoS limit
# will be used and not associated with capacity size.
#
# note: currently supports rbd-nbd mounter.
#
# For more details
# (optional) the base limit of read operations per second.
# BaseReadIops: <>
# (optional) the base limit of write operations per second.
# BaseWriteIops: <>
# (optional) the base limit of read bytes per second.
# BaseReadBytesPerSecond: <>
# (optional) the base limit of write bytes per second.
# BaseWriteBytesPerSecond: <>
# (optional) the limit of read operations per GiB.
# ReadIopsPerGiB: <>
# (optional) the limit of write operations per GiB.
# WriteIopsPerGiB: <>
# (optional) the limit of read bytes per GiB.
# ReadBpsPerGiB: <>
# (optional) the limit of write bytes per GiB.
# WriteBpsPerGiB: <>
# (optional) min size of volume what use to calc qos beased on capacity.
# BaseVolSizeBytes:<>
reclaimPolicy: Delete
allowVolumeExpansion: true

# If filesystem is xfs, nouuid will be automatically added to mountOptions
mountOptions:
- discard

2.2.5 创建pvc

# vim pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: csi-cephfs-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 1Gi
  storageClassName: csi-cephfs-sc

2.2.6 应用

cd ceph-csi/deploy/cephfs/kubernetes
k apply -f ../../ceph-conf.yaml # 必须应用此文件
k apply -f ./ # 应用全部创建csi,包括deployment等
k apply -f csi-config-map.yaml
k apply -f csi-config-map-kms.yaml
k apply -f secret.yaml
k apply -f storageclass.yaml
k apply -f pvc.yaml

2.2.7 创建pod

# cat pod.yaml
---
apiVersion: v1
kind: Pod
metadata:
  name: csi-cephfs-demo-pod
spec:
  containers:
    - name: web-server
      image: nginx
      volumeMounts:
        - name: mypvc
          mountPath: /var/lib/www
  volumes:
    - name: mypvc
      persistentVolumeClaim:
        claimName: csi-cephfs-pvc
        readOnly: false

应用,即可看到挂载。对应的ceph路径为/volumes/csi/csi-vol-035561d6-1f49-4477-9c6d-794382609b66/9247952c-12bd-4ecb-8845-1e2ec3bf1066/

10.244.0.228:6789:/volumes/csi/csi-vol-035561d6-1f49-4477-9c6d-794382609b66/9247952c-12bd-4ecb-8845-1e2ec3bf1066  1.0G     0  1.0G   0% /var/lib/www
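
可以进一步在pod内验证挂载与读写是否正常(示例):

kubectl get pvc csi-cephfs-pvc
kubectl exec -it csi-cephfs-demo-pod -- df -h /var/lib/www
kubectl exec -it csi-cephfs-demo-pod -- sh -c "echo hello > /var/lib/www/test.txt"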

2.3 镜像列表

registry.k8s.io/sig-storage/csi-snapshotter:v8.2.0
registry.k8s.io/sig-storage/csi-resizer:v1.13.1
registry.k8s.io/sig-storage/csi-provisioner:v5.1.0
registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.13.0
quay.io/cephcsi/cephcsi:canary

2.4 负载均衡

2.4.1 保证每台节点均有rgw与mds服务
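
可以通过orchestrator把mds与rgw扩展到每台节点上,示例如下(主机名沿用前文的k10/k11/k12,服务名为假设,需按实际环境替换):

ceph orch apply mds cephfs --placement="3 k10 k11 k12"
ceph orch apply rgw default --realm=default --zone=default --port=8000 --placement="3 k10 k11 k12"
# 确认各节点上的daemon分布
ceph orch ps | grep -E 'mds|rgw'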

2.4.2 修改coredns

kubectl edit configmap coredns -n kube-system
# 加入如下中的hosts段:
Corefile: |
  .:53 {
      errors
      health {
         lameduck 5s
      }
      hosts {
         10.144.96.10 k10 postgres.service.com s3.service.com
         10.144.96.11 k11 postgres.service.com s3.service.com
         10.144.96.12 k12 postgres.service.com s3.service.com
         fallthrough
      }
      ready
      kubernetes cluster.local in-addr.arpa ip6.arpa {
         pods insecure
         fallthrough in-addr.arpa ip6.arpa
         ttl 30
      }

2.5 选型

2.5.1 Cephfs

  • 优点

    • 读取延迟低,I/O带宽表现良好,尤其是block size较大一些的文件

    • 灵活度高,支持k8s的所有接入模式

  • 缺点

    • 写入延迟相对较高且延迟时间不稳定
  • 适用场景

    • 适用于要求灵活度高(支持k8s多节点挂载特性),对I/O延迟不甚敏感的文件读写操作,以及非海量的小文件存储支持.例如作为常用的应用/中间件挂载存储后端.

2.5.2 Ceph RBD

  • 优点

    • I/O带宽表现良好

    • 读写延迟都很低

    • 支持镜像快照,镜像转储

  • 缺点

    • 不支持多节点挂载
  • 适用场景

    • 对I/O带宽和延迟要求都较高,且无多个节点同时读写数据需求的应用,例如数据库

2.5.3 测试

2.5.3.1 可用工具

sysbench

akopytov/sysbench: Scriptable database and system performance benchmark
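
sysbench的fileio模式可以用来对比两种挂载方式的随机读写表现,示例如下(在对应挂载目录下执行,文件大小与时长可按需调整):

sysbench fileio --file-total-size=1G --file-num=16 prepare
sysbench fileio --file-total-size=1G --file-num=16 --file-test-mode=rndrw --time=60 run
sysbench fileio --file-total-size=1G --file-num=16 cleanup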

2.5.3.2 写入一个大文件
  • rbd
# dd if=/dev/zero  of=test bs=1M count=2048
2048+0 records in
2048+0 records out
2147483648 bytes (2.1 GB, 2.0 GiB) copied, 1.8003 s, 1.2 GB/s
  • fs
# dd if=/dev/zero  of=test bs=1M count=2048
2048+0 records in
2048+0 records out
2147483648 bytes (2.1 GB, 2.0 GiB) copied, 1.86057 s, 1.2 GB/s
2.5.3.3 写入一万个小文件
  • rbd
# time seq 10000 | xargs -i dd if=/dev/zero of={}.dat bs=1024 count=1
real 0m12.695s
user 0m2.479s
sys 0m10.661s
  • fs
# time seq 10000 | xargs -i dd if=/dev/zero of={}.dat bs=1024 count=1
real 0m20.365s
user 0m3.552s
sys 0m10.236s

3. 其他命令

sudo ceph orch host ls
sudo ceph osd map test sys.txt
sudo ceph orch ls
sudo ceph mds metadata
sudo ceph health detail
sudo ceph -s
sudo ceph orch stop mds.test
sudo rados df
radosgw-admin user create --uid=s3 --display-name="object_storage" --system
ceph orch device ls
sgdisk --zap-all /dev/nvme0n1
fdisk

4. 参考

Ceph分布式存储系统的介绍及详细安装部署过程:详细实战版(保姆级)_ceph存储部署-CSDN博客

K8S使用ceph实现持久化存储 - hovin - 博客园

Ubuntu 22.04 安装 ceph 集群 | 小汪老师

Ubuntu CEPH快速安装-腾讯云开发者社区-腾讯云

Ubuntu22.04LTS基于cephadm快速部署Ceph Reef(18.2.X)集群 -阿里云开发者社区

Ceph集群详细部署配置图文讲解,只要看一遍就能上手(二)【与云原生的故事】-云社区-华为云

Ceph 服务管理之OSD服务 - Varden - 博客园

ceph(五)CephFS部署、使用和MDS高可用实现 - areke - 博客园

【ceph】ceph分布式存储MDS(各种状态、源码)_ceph mds-CSDN博客

为什么ceph没有在新节点上检测到ssd设备?-腾讯云开发者社区-腾讯云

ceph 运维操作-MDS - 简书

2.6. 使用 Ceph Orchestrator 管理 MDS 服务 | Red Hat Product Documentation

9.2. 使用命令行界面部署 MDS 服务 | Red Hat Product Documentation

在Ubuntu20.04下基于ceph-deploy部署ceph 16.2.10 - cyh00001 - 博客园

Linux | Ceph | Ubuntu 中部署 Ceph 集群 - 隔江千万里 - 博客园

二、Ceph的ceph-deploy部署 - yaowx - 博客园

Ubuntu部署ceph:安装ceph-deploy遇到的问题总汇_ceph-deploy ubuntu系统-CSDN博客

CEPH-1:ceph-deploy离线部署ceph集群及报错解决FAQ - 塔克拉玛攻城狮 - 博客园

Index of /debian-19.2.1/dists/bookworm/

Ceph集群添加MDS — Cloud Atlas: Discovery beta 文档

Ceph故障排除: 1 pool(s) do not have an application enabled-CSDN博客

PG 异常状态- active+undersized+degraded-CSDN博客

CephFS挂载 - OrcHome

HEALTH_WARN mds 0 is laggy的解决方法_mount error: no mds server is up or the cluster is-CSDN博客

Ceph保姆级安装教程(详细 )_ceph 安装-CSDN博客

详解cephfs几种挂载方式_cephfs挂载-CSDN博客

安装ceph (快速) 步骤三: Ceph 客户端 - 哈喽哈喽111111 - 博客园

ceph-create-keys – ceph keyring generate tool — Ceph Documentation

ceph(二)CephX认证授权、用户管理和keyring - areke - 博客园

ceph/ceph-csi: CSI driver for Ceph

Ceph块存储-1·Client客户端使用 - 简书

Ceph的客户端安装 - CL.TANG - 博客园

ceph客户端配置 - 山的那一边 - 博客园

Ceph集群搭建系列(四):CephFS client客户端使用CephFS_ceph查看client ip-CSDN博客

无法从DNS SRV获取服务名称:ceph的监视器信息。-腾讯云开发者社区-腾讯云

unable to get monitor info from DNS SRV with service name: ceph-mon · Issue #3595 · rook/rook

mount.ceph – mount a Ceph file system — Ceph Documentation

[SOLVED] - CEPH Mirroring : unable to get monitor info from DNS SRV with service name: ceph-mon | Proxmox Support Forum

Cannot open ceph.conf | Proxmox Support Forum

【原创】K8S使用ceph-csi持久化存储之CephFS - wsjhk - 博客园

ceph/ceph-csi at release-v3.13

5.5 ceph 集群状态说明 - 云起时。 - 博客园

[v3.11.0] pod with pvc failed to mount from ceph cluster. (stderr: unable to get monitor info from DNS SRV with service name: ceph-mon) · Issue #4771 · ceph/ceph-csi

K8S使用ceph-csi持久化存储之RBD - Chuyio - 博客园

【原创】K8S使用ceph-csi持久化存储之RBD - wsjhk - 博客园

No such file or directory: “subvolume group ‘csi’ does not exist” · Issue #4548 · ceph/ceph-csi

第33讲:K8S集群StorageClass使用Ceph CSI供应商与Cephfs文件系统集成-CSDN博客

容器镜像加速服务

Cephadm部署使用rgw对象网关(s3cmd和Java)_cephadm rgw-CSDN博客

Ceph对象存储 S3 - 李占勋 - 博客园

9.3. 使用 Ceph Orchestrator 删除 MDS 服务 | Red Hat Product Documentation

ceph mon时钟偏移问题 | 夏天的风的博客

Cephadm Operations — Ceph Documentation

实战篇:使用rook在k8s上搭建ceph集群 - 知乎

K8S中部署Ceph | 左老师的课堂笔记

kubernetes 部署 rook+ceph 存储系统-腾讯云开发者社区-腾讯云

Ceph分布式存储系列(七):对象存储RGW和S3cmd的安装配置及常用命令_对象存储常用命令-CSDN博客

s3cmd在配置后使用时提示ERROR: S3 error: 403 (InvalidAccessKeyId): The AWS Access Key Id you provided does not exist in our records. - Believer007 - 博客园

在k8s中通过CoreDNS进行域名解析的其中三种方法_coredns添加域名解析-CSDN博客

K8s 跨 namespace 访问服务_kubernetes 跨namespace svc 访问-CSDN博客

Kubernetes ExternalName类型的服务 - 人艰不拆_zmc - 博客园

kubernetes pod间通信,跨namespace互访_在 kubernetes 集群中 pod和pod之间的访问流程-CSDN博客

Linux中一个ip绑定多个域名的详细步骤_hosts文件一个ip对应多个域名-CSDN博客

linux中,如何在/etc/hosts中将一个域名解析为多个IP地址?工作原理是什么? - Zhai_David - 博客园

hosts文件的作用以及hosts中多个ip映射一个域名地址的解析顺序_hosts 多个ip对应一个域名-CSDN博客

Overview — Ceph Documentation

Ceph Internals - msgr2 protocol - 《Ceph v15.0 Document》 - 书栈网 · BookStack

Configuration - Messenger v2 protocol - 《Ceph v15.0 Document》 - 书栈网 · BookStack

6.3. 为什么 ceph-volume 替换 ceph-disk? | Red Hat Product Documentation

浅学lvm以及lvm在ceph中的应用_ceph osd 为什么采用lvm-CSDN博客

Ceph删除OSD和Host的正确方法 - iouwenbo - 博客园

6.10. 使用 Ceph Orchestrator 删除 OSD 守护进程 | Red Hat Product Documentation

ceph-volume 创建osd - 简书

ceph相关的命令_ceph orch命令-CSDN博客

6.11. 使用 Ceph Orchestrator 替换 OSD | Red Hat Product Documentation

Ceph后端存储引擎BlueStore — Cloud Atlas: Discovery beta 文档

BlueStore Migration — Ceph Documentation

ceph 删除和添加osd_ceph删除osd-CSDN博客

从Ceph集群中删除OSD节点 | Sirius’s Blog

Ceph OSD删除与磁盘释放教程-CSDN博客

如何将下线的OSD磁盘,重新初始化上线使用 - ST运维 - 博客园

解决重装系统后有磁盘被ceph占用问题_如何去除磁盘的sdb的ceph-CSDN博客

Linux格式化并重新加载磁盘_writing superblocks and filesystem-CSDN博客

7.4. 使用 ceph-volume准备 Ceph OSD | Red Hat Product Documentation

Ceph bluestore 和 ceph-volume - 代码杂货铺

Ceph:关于 Ceph 中 BlueStore 架构以及 OSD 创建的一些笔记-腾讯云开发者社区-腾讯云

[ ceph ] BlueStore 存储引擎介绍 - hukey - 博客园

OSD Service — Ceph Documentation

ceph 分层缓存 cache pool - 简书

Ceph 进阶系列(二):如何让某个 pool 使用特定的 OSD 设备 (1 of 2,手动版,早于 luminous 版本)_ceph osd pool 能指定硬盘吗-CSDN博客

ceph 指定OSD创建pool-腾讯云开发者社区-腾讯云

ceph中pool的管理 - 波神 - 博客园

Ceph篇之利用shell脚本实现批量创建bucket桶-CSDN博客

全局Ceph节点宕机处理-CSDN博客

ceph里面osd容量分布不均问题的处理办法 - 蓝枫居士 - 博客园

ceph rgw: zone/zone/group/realm - 简书

ceph挂载osd时出现permission denied问题_error einval: failed to connect to ceph2 (ceph2). -CSDN博客

2.3. 启动、停止和重启所有 Ceph 服务 | Red Hat Product Documentation

2. 常见 OSD 故障处理 · Ceph 运维手册

CRUSH Maps — Ceph Documentation

创建Ceph crush运行图实现基于HDD和SSD磁盘实现数据冷热数据分类存储 - PunchLinux - 博客园

3.4. 配置 Bucket 分片 | Red Hat Product Documentation

5.10. zone group 和 zone 配置设置 | Red Hat Product Documentation

ceph之crush map - 阳台 - 博客园

Ceph CRUSH 规则 - 简书

Docker查看容器挂载目录_docker 查看挂载目录-CSDN博客

ceph radosgw 对象存储 配额控制_radosgw-admin quota stats –quota-scope=user-CSDN博客

Ceph获取对应存储池配额及修改 - 知乎

ceph 对象网关多区部署_rgw zonegroup 和zone-CSDN博客

分布式存储ceph 对象存储配置zone同步_ceph修改endpoints-CSDN博客

Ceph RGW multi site 配置 | yanyx’s blog

012 Ceph多区域网关 - 梦中泪 - 博客园

创建,查看,删除pool,查看,修改pool参数命令总结 - sisimi_2017 - 博客园

k8s如何强制删除pod&pv&pvc和ns&namespace方法 - 记忆流年 - 博客园

ceph-csi/examples/cephfs/storageclass.yaml at devel · ceph/ceph-csi

ceph运维大宝剑之pg常见状态修复 | 奋斗的松鼠 - Blog

Setting a zone group - IBM Documentation

Multi-Site — Ceph Documentation

ceph部署与配置及部署过程遇到的问题解决_ceph dashboard creat-self-signed-cert 报错-CSDN博客

【ceph运维】修改ceph集群配置 - 苏格拉底的落泪 - 博客园

Configuring multiple realms in the same storage cluster - IBM Documentation

【ceph相关】ceph基准性能测试工具_rbd bench-CSDN博客

Ceph CrushMap及RGW Placement设置 - 简书

ceph 读写测试 rados bench - 简书

Ceph 创建 OSD 报错 ‘GPT headers found, they must be removed’ 的处理 - 简书

Ceph Placement rule(副本放置规则) - Varden - 博客园

https://docs.ceph.com/en/latest/rados/configuration/osd-config-ref/#recovery

ceph数据recovery配置策略(数据recovery流量控制) - 钟桂耀 - 博客园

Ceph 16 模拟坏盘和恢复 | GuKaifeng’s Blog

Linux 删除磁盘设备上的 LVM | GuKaifeng’s Blog

【ceph运维】PG相关命令 - 苏格拉底的落泪 - 博客园

ceph的pg的分布的快速查看 - 武汉-磨渣 - 博客园

Ceph recover的速度控制 - 多看多学多记多实践 - 博客园

ceph recovering速度控制-腾讯云开发者社区-腾讯云

Ceph PG状态及故障模拟_51CTO博客_ceph PG

osd max backfills_osd max backfills配置-CSDN博客

ceph 数据恢复和回填速度 重建osd 加快数据恢复_ceph tell osd.* injectargs ‘–osd-max-backfills 50-CSDN博客

Ceph RBD和QEMU块设备qos测试_ceph rbd qos 热生效-CSDN博客

Ceph 入门到实战之 RBD 块存储接口-腾讯云开发者社区-腾讯云

Cephfs & Ceph RBD 在k8s中的适用场景讨论及数据库性能压测_cephfs和ceph rbd的使用场景-CSDN博客

ceph-csi/examples/rbd/storageclass.yaml at devel · ceph/ceph-csi

3x performance degradation with cephfs vs rbd · Issue #1706 · rook/rook

第30讲:Ceph集群RBD块存储通过CSI客户端与K8S StorageClass集成_ceph-csi rbd-CSDN博客

远程使用ceph rbd块设备

ceph 开启rbd与远程挂载

Linux 远程挂载 Ceph RBD 磁盘-腾讯云开发者社区-腾讯云

【Ceph Block Device】块设备挂载使用-CSDN博客

客户端使用RBD-崔亮的博客

ceph-radosgw 手动安装教程以及安装问题&解决办法-CSDN博客

cephfs文件系统场景 - doublexi - 博客园

How to delete “CephFS”? | Proxmox Support Forum

Deploy Hyper-Converged Ceph Cluster

cephfs创建和删除pool_ceph删除pool-CSDN博客

  • Title: ceph部署
  • Author: Ethereal
  • Created at: 2025-03-04 21:58:41
  • Updated at: 2025-03-31 19:13:11
  • Link: https://ethereal-o.github.io/2025/03/04/ceph部署/
  • License: This work is licensed under CC BY-NC-SA 4.0.