Building a Self-Hosted NFS High-Availability Cluster on Ceph-RGW
At the request of the chief engineer, we built an NFS high-availability cluster backed by object storage.
1. Environment Setup
Create three virtual machines. Each VM has three disks (two of them used as OSDs) and two NICs on separate subnets. The planned IPs are:
- node1 public:192.168.40.66; cluster:172.16.6.66
- node2 public:192.168.40.67; cluster:172.16.6.67
- node3 public:192.168.40.68; cluster:172.16.6.68
Edit /etc/hosts on every VM:
[root@node1 ~]# vim /etc/hosts
192.168.40.66 node1
192.168.40.67 node2
192.168.40.68 node3
Set up passwordless SSH login
[root@node1 ~]# ssh-copy-id root@node1 # if no key exists yet, generate one first with ssh-keygen
[root@node1 ~]# ssh-copy-id root@node2
[root@node1 ~]# ssh-copy-id root@node3
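If no key pair exists yet, it can be generated non-interactively; the hosts file from above can then be pushed to the other nodes in one go (a sketch using the node names defined earlier):
# generate an RSA key pair without a passphrase (only needed once)
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa
# distribute /etc/hosts so all nodes resolve each other by name
for n in node2 node3; do
  scp /etc/hosts root@$n:/etc/hosts
done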
Kernel upgrade
The official recommendation is kernel version 4.x or later.
Upgrade the kernel with the following steps.
[root@node1 ~]# rpm --import https://www.elrepo.org/RPM-GPG-KEY-elrepo.org # import the ELRepo repository's public key
[root@node1 ~]# yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm # install the ELRepo yum repository
[root@node1 ~]# yum --disablerepo="*" --enablerepo="elrepo-kernel" list available # list the available kernel packages; a 5.4 and a 5.16 version show up
[root@node1 ~]# yum --enablerepo=elrepo-kernel install kernel-ml # --enablerepo enables the specified repository; elrepo is enabled by default, and elrepo-kernel is used here instead
# after the kernel is installed, it must be set as the default boot entry and the machine rebooted before it takes effect
[root@node1 ~]# awk -F\' '$1=="menuentry " {print i++ " : " $2}' /etc/grub2.cfg # list all kernels available on the system
[root@node1 ~]# grub2-set-default 0 # 0 is the index of the desired kernel from the list above
[root@node1 ~]# grub2-mkconfig -o /boot/grub2/grub.cfg # regenerate the grub configuration
[root@node1 ~]# reboot # reboot
[root@node1 ~]# uname -r # verify the running kernel
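The upgrade has to be repeated on node2 and node3. Once all nodes have rebooted, the running kernels can be checked in one go (a sketch, assuming passwordless SSH is already set up):
# print the running kernel version of every node
for n in node1 node2 node3; do
  echo -n "$n: "; ssh root@$n uname -r
done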
Disable the firewall and SELinux
[root@node1 ~]# systemctl stop firewalld
[root@node1 ~]# systemctl disable firewalld
[root@node1 ~]# setenforce 0
[root@node1 ~]# vi /etc/selinux/config
Set SELINUX=disabled:
SELINUX=disabled
Or simply run the following command:
[root@node1 ~]# sed -i 's/=enforcing/=disabled/' /etc/selinux/config
Time synchronization
Install chrony on all nodes:
yum -y install chrony
Configure the chrony service on node1:
[root@node1 ~]# vim /etc/chrony.conf
server ntp.aliyun.com iburst # comment out the other server lines
......
#allow 192.168.0.0/16
allow 192.168.40.0/24 # add the subnet that is allowed to query this server
[root@node1 ~]# systemctl enable chronyd
[root@node1 ~]# systemctl start chronyd
On node2 and node3, delete the other server entries so that only the following server line remains:
[root@node2 ~]# vim /etc/chrony.conf
......
server 192.168.40.66 iburst
[root@node2 ~]# systemctl enable chronyd
[root@node2 ~]# systemctl start chronyd
[root@node2 ~]# chronyc sources -v
210 Number of sources = 1
...
MS Name/IP address Stratum Poll Reach LastRx Last sample
===============================================================================
^? node1 0 8 0 - +0ns[ +0ns] +/- 0ns
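The `^?` state simply means the source has not been reached or selected yet; after a short wait the marker should change to `^*`. Synchronization against node1 can also be checked with:
# show offset and synchronization status of the local chronyd
chronyc tracking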
Configure the yum repositories
This must be done on all nodes.
[root@node1 ~]# yum install -y epel-release
[root@node1 ~]# vim /etc/yum.repos.d/ceph.repo
[Ceph]
name=Ceph packages for $basearch
baseurl=http://mirrors.aliyun.com/ceph/rpm-nautilus/el7/x86_64/
enabled=1
gpgcheck=0
type=rpm-md
gpgkey=https://download.ceph.com/keys/release.asc
[Ceph-noarch]
name=Ceph noarch packages
baseurl=http://mirrors.aliyun.com/ceph/rpm-nautilus/el7/noarch
enabled=1
gpgcheck=0
type=rpm-md
gpgkey=https://download.ceph.com/keys/release.asc
[ceph-source]
name=Ceph source packages
baseurl=http://mirrors.aliyun.com/ceph/rpm-nautilus/el7/SRPMS
enabled=1
gpgcheck=0
type=rpm-md
gpgkey=https://download.ceph.com/keys/release.asc
[root@node1 ~]# yum clean all && yum makecache
[root@node1 ~]# yum update
[root@node1 ~]# yum install ceph-deploy -y # install only on the primary (deploy) node
[root@node1 ~]# yum install -y ceph ceph-mon ceph-mgr ceph-mgr-dashboard ceph-radosgw ceph-mds ceph-osd # install on all nodes
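Since the repo file and the Ceph packages are needed on every node, one way to push them from node1 is a small loop like the following (a sketch, assuming passwordless SSH is already in place):
# distribute the repo file and install the Ceph packages on the other nodes
for n in node2 node3; do
  scp /etc/yum.repos.d/ceph.repo root@$n:/etc/yum.repos.d/
  ssh root@$n "yum install -y epel-release && yum install -y ceph ceph-mon ceph-mgr ceph-mgr-dashboard ceph-radosgw ceph-mds ceph-osd"
done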
2. Install and Configure Ceph
[root@node1 ~]# mkdir -p /root/my-cluster # directory for the initial Ceph configuration files
[root@node1 ~]# cd ~/my-cluster
[root@node1 my-cluster]# ceph-deploy new --public-network 192.168.40.0/24 --cluster-network 172.16.6.0/24 node1 # create a Ceph cluster with node1 as the initial monitor
[root@node1 my-cluster]# ceph-deploy mon create-initial # bootstrap the monitor
[root@node1 my-cluster]# ceph-deploy admin node1 node2 node3 # distribute the admin keyring and config file to the nodes
[root@node1 my-cluster]# ceph-deploy mgr create node1 node2 node3 # deploy the Manager daemons
[root@node1 my-cluster]# ceph-deploy mon create node2 node3 # add additional monitors
[root@node1 my-cluster]# ceph -s
[root@node1 my-cluster]# ceph-deploy osd create node1 --data /dev/sdb # add OSDs; confirm the device names with lsblk beforehand
[root@node1 my-cluster]# ceph-deploy osd create node2 --data /dev/sdb
[root@node1 my-cluster]# ceph-deploy osd create node3 --data /dev/sdb
[root@node1 my-cluster]# ceph-deploy osd create node1 --data /dev/sdc
[root@node1 my-cluster]# ceph-deploy osd create node2 --data /dev/sdc
[root@node1 my-cluster]# ceph-deploy osd create node3 --data /dev/sdc
[root@node1 my-cluster]# ceph osd tree # check the OSD status
[root@node1 my-cluster]# ceph -s # check the cluster status
[root@node1 my-cluster]# ceph mgr module enable dashboard # enable the dashboard
[root@node1 my-cluster]# ceph dashboard create-self-signed-cert # create a self-signed certificate
[root@node1 my-cluster]# echo 123456 >pass.txt
[root@node1 my-cluster]# ceph dashboard set-login-credentials admin -i pass.txt # set the web login username and password
[root@node1 my-cluster]# ceph mgr services # show how to reach the mgr services
Once the dashboard is enabled, the Ceph cluster status can be viewed at https://192.168.40.66:8443/.
3. Configure the S3 Service
[root@node1 my-cluster]# ceph-deploy rgw create node1 node2 node3
[root@node1 my-cluster]# ceph -s
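As a quick sanity check, the RGW frontend (listening on port 7480 by default in Nautilus) should answer an anonymous request with an empty S3 bucket listing:
# an anonymous request should return a ListAllMyBucketsResult XML document
curl http://node1:7480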
Create the storage pools
[root@node1 my-cluster]# ceph osd crush class ls
[
"hdd"
]
[root@node1 my-cluster]# ceph osd crush rule create-replicated rule-hdd default host hdd
[root@node1 my-cluster]# ceph osd crush rule ls
replicated_rule
rule-hdd
[root@node1 my-cluster]# ceph osd pool create default.rgw.buckets.data 128 128 # create the data pool; default.rgw.buckets.index already exists and does not need to be created again
[root@node1 my-cluster]# ceph osd pool application enable default.rgw.buckets.data rgw
[root@node1 my-cluster]# ceph osd pool application enable default.rgw.buckets.index rgw
Change the CRUSH rule of all pools. On node1 run:
[root@node1 my-cluster]# for i in `ceph osd lspools | grep -v data | awk '{print $2}'`; do ceph osd pool set $i crush_rule rule-hdd; done
[root@node1 my-cluster]# ceph osd pool set default.rgw.buckets.data crush_rule rule-hdd
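To confirm that the pools actually picked up the new rule, query one of them, for example:
# the data pool should now report rule-hdd
ceph osd pool get default.rgw.buckets.data crush_rule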
[root@node1 my-cluster]# radosgw-admin user create --uid="s3admin" --display-name="First User" # create an S3 user
{
    "user_id": "s3admin",
    "display_name": "First User",
    "email": "",
    "suspended": 0,
    "max_buckets": 1000,
    "subusers": [],
    "keys": [
        {
            "user": "s3admin",
            "access_key": "I1H2PSMU31FK95A49PX8",
            "secret_key": "tzcHbjXAPWrV9qHtLSHxQ63BhR55XfBc2Q8DBaBX"
        }
    ],
    "swift_keys": [],
    "caps": [],
    "op_mask": "read, write, delete",
    "default_placement": "",
    "default_storage_class": "",
    "placement_tags": [],
    "bucket_quota": {
        "enabled": false,
        "check_on_raw": false,
        "max_size": -1,
        "max_size_kb": 0,
        "max_objects": -1
    },
    "user_quota": {
        "enabled": false,
        "check_on_raw": false,
        "max_size": -1,
        "max_size_kb": 0,
        "max_objects": -1
    },
    "temp_url_keys": [],
    "type": "rgw",
    "mfa_ids": []
}
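If the access and secret keys are needed again later, they can be printed at any time:
# show the user's details, including access_key and secret_key
radosgw-admin user info --uid=s3admin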
[root@node1 my-cluster]# yum install python-boto # install the python-boto module
[root@node1 my-cluster]# vi s3test.py # a simple script that creates a bucket
import boto.s3.connection

access_key = 'I1H2PSMU31FK95A49PX8'
secret_key = 'tzcHbjXAPWrV9qHtLSHxQ63BhR55XfBc2Q8DBaBX'
conn = boto.connect_s3(
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    host='node1', port=7480,
    is_secure=False, calling_format=boto.s3.connection.OrdinaryCallingFormat(),
)
bucket = conn.create_bucket('bucker1')
for bucket in conn.get_all_buckets():
    print "{name} {created}".format(
        name=bucket.name,
        created=bucket.creation_date,
    )
[root@node1 my-cluster]# python s3test.py
bucker1 2022-01-21T01:39:11.186Z # the bucket was created successfully
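The new bucket can also be confirmed from the server side:
# list all buckets known to RGW
radosgw-admin bucket list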
You can also use S3 Browser to test creating and deleting buckets and uploading and downloading objects.
4. Install and Configure nfs-ganesha
nfs-ganesha must be installed on all nodes.
[root@node1 my-cluster]# vim /etc/yum.repos.d/nfs-ganesha.repo
[nfsganesha]
name=nfsganesha
baseurl=https://mirrors.cloud.tencent.com/ceph/nfs-ganesha/rpm-V2.8-stable/nautilus/x86_64/
gpgcheck=0
enabled=1
[root@node1 my-cluster]# yum makecache
[root@node1 my-cluster]# yum install -y nfs-ganesha nfs-ganesha-ceph nfs-ganesha-rados-grace nfs-ganesha-rgw nfs-utils rpcbind haproxy keepalived
[root@node1 my-cluster]# ceph-deploy --overwrite-conf admin node1 node2 node3 # push the updated config files to all nodes
[root@node1 my-cluster]# vim /etc/ganesha/ganesha.conf # add the following configuration
NFS_CORE_PARAM {
mount_path_pseudo = true;
NFS_Port = 62049;
Enable_RQUOTA = false;
}
EXPORT_DEFAULTS {
Access_Type = RW;
Anonymous_uid = 65534;
Anonymous_gid = 65534;
}
LOG {
Default_Log_Level = INFO;
Facility {
name = FILE;
destination = "/var/log/ganesha/ganesha.log";
enable = active;
}
}
RGW {
name = client.rgw.node1; # change to the corresponding name on node2 and node3
cluster = ceph;
}
NFSv4 {
# pnfs_mds = true;
# pnfs_ds = true;
}
%include /etc/ganesha/nfs-1.conf
[root@node1 my-cluster]# vim /etc/ganesha/nfs-1.conf
EXPORT
{
Export_Id = 1;
Path = bucket1;
Pseudo = /bucket1;
Squash = no_root_squash;
Access_Type = RW;
# Protocols = 4; # this setting prevented the NFS export from working; the cause is unknown
# Transports = TCP;
FSAL {
Name = RGW;
User_Id = "s3admin";
Access_Key_Id = "I1H2PSMU31FK95A49PX8";
Secret_Access_Key = "tzcHbjXAPWrV9qHtLSHxQ63BhR55XfBc2Q8DBaBX";
}
}
[root@node1 my-cluster]# systemctl start nfs-ganesha
[root@node1 my-cluster]# systemctl enable nfs-ganesha
[root@node1 my-cluster]# scp /etc/ganesha/ganesha.conf root@node2:/etc/ganesha/ganesha.conf # copy the config to the other two nodes; remember to change the name value in the RGW section
[root@node1 my-cluster]# scp /etc/ganesha/ganesha.conf root@node3:/etc/ganesha/ganesha.conf
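Note that ganesha.conf includes /etc/ganesha/nfs-1.conf, so that file presumably has to be copied to node2 and node3 as well, for example:
# the export definition referenced by %include must exist on every node
scp /etc/ganesha/nfs-1.conf root@node2:/etc/ganesha/nfs-1.conf
scp /etc/ganesha/nfs-1.conf root@node3:/etc/ganesha/nfs-1.conf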
[root@node1 my-cluster]# ssh root@node2 systemctl start nfs-ganesha # start the nfs-ganesha service on node2 and node3
[root@node1 my-cluster]# ssh root@node3 systemctl start nfs-ganesha
[root@node1 my-cluster]# ssh root@node2 systemctl enable nfs-ganesha
[root@node1 my-cluster]# ssh root@node3 systemctl enable nfs-ganesha
[root@node1 my-cluster]# systemctl status nfs-ganesha
[root@node1 my-cluster]# showmount -e node1
Export list for node1:
/bucket1 (everyone)
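At this point the export can already be mounted directly from one node. Since ganesha listens on the non-standard port 62049 configured above, the port has to be passed explicitly (a sketch, assuming a test mount point /mnt/nfs-test on a client):
# mount the pseudo path /bucket1 straight from node1 on the custom port
mkdir -p /mnt/nfs-test
mount -t nfs4 -o port=62049 node1:/bucket1 /mnt/nfs-test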
5. Configure HAProxy
[root@node1 my-cluster]# echo "net.ipv4.ip_nonlocal_bind = 1" >> /etc/sysctl.conf
[root@node1 my-cluster]# sysctl -p # run these two commands on every node; they allow binding to an IP address that is not currently assigned to the host
[root@node1 my-cluster]# vim /etc/haproxy/haproxy.cfg
global
    log 127.0.0.1 local2
    chroot /var/lib/haproxy
    pidfile /var/run/haproxy.pid
    maxconn 8000
    user haproxy
    group haproxy
    daemon
    stats socket /var/lib/haproxy/stats

defaults
    mode http
    log global
    option httplog
    option dontlognull
    option http-server-close
    # option forwardfor except 127.0.0.0/8
    option redispatch
    retries 3
    timeout http-request 10s
    timeout queue 1m
    timeout connect 10s
    timeout client 1m
    timeout server 1m
    timeout http-keep-alive 10s
    timeout check 10s
    maxconn 8000

listen stats
    bind *:9090
    mode http
    stats enable
    stats uri /
    stats refresh 5s
    stats realm Haproxy\ Stats
    stats auth admin:admin

frontend nfs-in
    bind 192.168.40.69:2049
    mode tcp
    option tcplog
    default_backend nfs-back

frontend s3-in
    bind 192.168.40.69:8080
    mode tcp
    option tcplog
    default_backend s3-back

frontend dashboard-in
    bind 192.168.40.69:8888
    mode tcp
    option tcplog
    default_backend dashboard-back

backend nfs-back
    balance source
    mode tcp
    log /dev/log local0 debug
    server node1 192.168.40.66:62049 check
    server node2 192.168.40.67:62049 check
    server node3 192.168.40.68:62049 check

backend s3-back
    balance source
    mode tcp
    log /dev/log local0 debug
    server node1 192.168.40.66:7480 check
    server node2 192.168.40.67:7480 check
    server node3 192.168.40.68:7480 check

backend dashboard-back
    balance source
    mode tcp
    log /dev/log local0 debug
    server node1 192.168.40.66:8443 check
    server node2 192.168.40.67:8443 check
    server node3 192.168.40.68:8443 check
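Before starting the service it can be useful to validate the configuration syntax:
# check the configuration file for errors without actually starting the proxy
haproxy -c -f /etc/haproxy/haproxy.cfg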
[root@node1 my-cluster]# systemctl start haproxy
[root@node1 my-cluster]# systemctl enable haproxy
[root@node1 my-cluster]# scp /etc/haproxy/haproxy.cfg root@node2:/etc/haproxy/haproxy.cfg
[root@node1 my-cluster]# ssh root@node2 systemctl start haproxy
[root@node1 my-cluster]# ssh root@node2 systemctl enable haproxy
[root@node1 my-cluster]# scp /etc/haproxy/haproxy.cfg root@node3:/etc/haproxy/haproxy.cfg
[root@node1 my-cluster]# ssh root@node3 systemctl start haproxy
[root@node1 my-cluster]# ssh root@node3 systemctl enable haproxy
6. Configure Keepalived
Configure Keepalived in preemptive mode: node1 is the MASTER with priority 200; node2 and node3 are BACKUPs with priorities 150 and 100 respectively.
[root@node1 my-cluster]# vim /etc/keepalived/keepalived.conf
global_defs {
    router_id CEPH_NFS          # identifier; any string will do
}
vrrp_script check_haproxy {     # health-check script
    script "killall -0 haproxy"
    weight -20
    interval 2
    rise 2
    fall 2
}
vrrp_instance VI_0 {
    state MASTER                # MASTER on node1; change to BACKUP on the other nodes
    priority 200                # the node with the highest priority gets the VIP; use 150 and 100 on the other two nodes
    interface ens192            # network interface
    virtual_router_id 51        # must be 51 on all three nodes
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass 1234
    }
    virtual_ipaddress {
        192.168.40.69/24 dev ens192   # virtual IP
    }
    track_script {
        check_haproxy
    }
}
[root@node1 my-cluster]# systemctl start keepalived
[root@node1 my-cluster]# systemctl enable keepalived
Remember to adjust the Keepalived configuration (state and priority) on node2 and node3 before starting the service there.
[root@node1 my-cluster]# ssh root@node2 systemctl start keepalived
[root@node1 my-cluster]# ssh root@node2 systemctl enable keepalived
[root@node1 my-cluster]# ssh root@node3 systemctl start keepalived
[root@node1 my-cluster]# ssh root@node3 systemctl enable keepalived
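To see which node currently holds the virtual IP (it should be node1, the MASTER, right after startup), check the interface on each node, for example:
# the VIP 192.168.40.69 should appear on exactly one node
for n in node1 node2 node3; do
  echo "== $n =="
  ssh root@$n "ip -4 addr show ens192"
done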
7. Test and Verify High Availability
- NFS
Find a test machine, mount the NFS export on it, and test the high availability, as sketched below.
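A minimal failover test from a client might look like this (assuming a mount point /mnt/nfs): mount through the VIP, where HAProxy exposes the standard port 2049, start a long write, then reboot the node that currently holds the VIP and watch whether the write continues.
# mount via the virtual IP; HAProxy forwards 2049 to ganesha's 62049 on the backends
mount -t nfs4 192.168.40.69:/bucket1 /mnt/nfs
# generate continuous writes while one node is rebooted
dd if=/dev/zero of=/mnt/nfs/testfile bs=1M count=1024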
The virtual IP fails over correctly, and NFS reaches a basic level of high availability.
However, IO errors were observed during testing; the problem is still being investigated.
Similar cases have been reported in the project's upstream issues.
For now this looks like a limitation of RGW itself, which cannot provide a fully POSIX-compliant file system.
- S3
Uploading a file through S3 Browser and rebooting node1, the upload stalls for a few seconds and then continues.
The S3 service reaches a basic level of high availability.