Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions group_vars/all.yml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ dummy:
#
#ceph_mirror: https://download.ceph.com
#ceph_stable_key: https://download.ceph.com/keys/release.asc
#ceph_stable_release: squid
#ceph_stable_release: tentacle
#ceph_stable_repo: "{{ ceph_mirror }}/debian-{{ ceph_stable_release }}"

#nfs_ganesha_stable: true # use stable repos for nfs-ganesha
Expand Down Expand Up @@ -584,7 +584,7 @@ dummy:
#dashboard_disabled_features: []
#prometheus_frontend_vip: ''
#alertmanager_frontend_vip: ''
#node_exporter_container_image: "docker.io/prom/node-exporter:v0.17.0"
#node_exporter_container_image: "quay.io/prometheus/node-exporter:v1.9.1"
#node_exporter_port: 9100
#grafana_admin_user: admin
# This variable must be set with a strong custom password when dashboard_enabled is True
Expand All @@ -594,7 +594,7 @@ dummy:
#grafana_key: ''
# When using https, please fill with a hostname for which grafana_crt is valid.
#grafana_server_fqdn: ''
#grafana_container_image: "docker.io/grafana/grafana:6.7.4"
#grafana_container_image: "quay.io/ceph/grafana:12.3.1"
#grafana_container_cpu_period: 100000
#grafana_container_cpu_cores: 2
# container_memory is in GB
Expand Down Expand Up @@ -624,7 +624,7 @@ dummy:
#grafana_port: 3000
#grafana_network: "{{ public_network }}"
#grafana_conf_overrides: {}
#prometheus_container_image: "docker.io/prom/prometheus:v2.7.2"
#prometheus_container_image: "quay.io/prometheus/prometheus:v3.6.0"
#prometheus_container_cpu_period: 100000
#prometheus_container_cpu_cores: 2
# container_memory is in GB
Expand All @@ -637,7 +637,7 @@ dummy:
# Uncomment this variable if you need to customize the retention period for prometheus storage.
# set it to '30d' if you want to retain 30 days of data.
# prometheus_storage_tsdb_retention_time: 15d
#alertmanager_container_image: "docker.io/prom/alertmanager:v0.16.2"
#alertmanager_container_image: "quay.io/prometheus/alertmanager:v0.28.1"
#alertmanager_container_cpu_period: 100000
#alertmanager_container_cpu_cores: 2
# container_memory is in GB
Expand Down
176 changes: 125 additions & 51 deletions infrastructure-playbooks/rolling_update.yml
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,7 @@
environment:
CEPH_VOLUME_DEBUG: "{{ ceph_volume_debug }}"
when: not containerized_deployment | bool
and ceph_stable_release == 'nautilus'

- name: Activate scanned ceph-disk osds and migrate to ceph-volume if deploying nautilus
ceph_volume_simple_activate:
Expand All @@ -587,6 +588,7 @@
environment:
CEPH_VOLUME_DEBUG: "{{ ceph_volume_debug }}"
when: not containerized_deployment | bool
and ceph_stable_release == 'nautilus'

- name: Waiting for clean pgs...
ansible.builtin.command: "{{ container_exec_cmd_update_osd | default('') }} ceph --cluster {{ cluster }} pg stat --format json"
Expand Down Expand Up @@ -655,6 +657,18 @@
ansible.builtin.import_role:
name: ceph-facts

- name: Disable allow_standby_replay during rolling mds upgrade
ceph_fs:
name: "{{ cephfs }}"
cluster: "{{ cluster }}"
data: "{{ cephfs_data_pool.name }}"
metadata: "{{ cephfs_metadata_pool.name }}"
allow_standby_replay: false
environment:
CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
CEPH_CONTAINER_BINARY: "{{ container_binary }}"
when: _cephfs_info.value | default(false) | bool

- name: Deactivate all mds rank > 0 if any
when: groups.get(mds_group_name, []) | length > 1
block:
Expand All @@ -674,10 +688,11 @@
name: "{{ cephfs }}"
cluster: "{{ cluster }}"
state: info
info_param: mds_in_ranks
register: wait_rank_zero
retries: 720
delay: 5
until: (wait_rank_zero.stdout | from_json).mdsmap.in | length == 1 and (wait_rank_zero.stdout | from_json).mdsmap.in[0] == 0
until: (wait_rank_zero.value | default([]) | length) == 1 and (wait_rank_zero.value | default([]) | first) == 0
environment:
CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
CEPH_CONTAINER_BINARY: "{{ container_binary }}"
Expand Down Expand Up @@ -706,33 +721,6 @@
ansible_port: "{{ hostvars[item]['ansible_port'] | default(omit) }}"
with_items: "{{ groups[mds_group_name] | difference(mds_active_host) }}"

- name: Stop standby ceph mds
ansible.builtin.systemd:
name: "ceph-mds@{{ hostvars[item]['ansible_facts']['hostname'] }}"
state: stopped
enabled: false
delegate_to: "{{ item }}"
with_items: "{{ groups['standby_mdss'] }}"
when: groups['standby_mdss'] | default([]) | length > 0

# dedicated task for masking systemd unit
# somehow, having a single task doesn't work in containerized context
- name: Mask systemd units for standby ceph mds
ansible.builtin.systemd:
name: "ceph-mds@{{ hostvars[item]['ansible_facts']['hostname'] }}"
masked: true
delegate_to: "{{ item }}"
with_items: "{{ groups['standby_mdss'] }}"
when: groups['standby_mdss'] | default([]) | length > 0

- name: Wait until all standby mds are stopped
ansible.builtin.command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs dump -f json"
changed_when: false
register: wait_standbys_down
retries: 300
delay: 5
until: (wait_standbys_down.stdout | from_json).standbys | length == 0

- name: Create active_mdss group
ansible.builtin.add_host:
name: "{{ mds_active_host[0] if mds_active_host is defined else groups.get(mds_group_name)[0] }}"
Expand All @@ -741,10 +729,10 @@
ansible_port: "{{ hostvars[mds_active_host[0] if mds_active_host is defined else groups.get(mds_group_name)[0]]['ansible_port'] | default(omit) }}"


- name: Upgrade active mds
- name: Upgrade standby ceph mdss cluster
vars:
upgrade_ceph_packages: true
hosts: active_mdss
hosts: standby_mdss
tags: mdss
become: true
gather_facts: false
Expand All @@ -760,10 +748,18 @@
- name: Prevent restart from the packaging
ansible.builtin.systemd:
name: ceph-mds@{{ ansible_facts['hostname'] }}
state: stopped
enabled: false
masked: true
when: not containerized_deployment | bool

- name: Stop standby mds before package upgrade (containerized)
ansible.builtin.systemd:
name: ceph-mds@{{ ansible_facts['hostname'] }}
state: stopped
changed_when: false
when: containerized_deployment | bool

- name: Import ceph-handler role
ansible.builtin.import_role:
name: ceph-handler
Expand All @@ -786,23 +782,10 @@
ansible.builtin.import_role:
name: ceph-mds

- name: Restart ceph mds
ansible.builtin.systemd:
name: ceph-mds@{{ ansible_facts['hostname'] }}
state: restarted
enabled: true
masked: false
when: not containerized_deployment | bool

- name: Restart active mds
ansible.builtin.command: "{{ container_binary }} stop ceph-mds-{{ ansible_facts['hostname'] }}"
changed_when: false
when: containerized_deployment | bool

- name: Upgrade standbys ceph mdss cluster
- name: Upgrade active mds
vars:
upgrade_ceph_packages: true
hosts: standby_mdss
hosts: active_mdss
tags: mdss
become: true
gather_facts: false
Expand All @@ -816,12 +799,91 @@
ansible.builtin.import_role:
name: ceph-facts

- name: Get CephFS status before deciding failover wait
ansible.builtin.command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs status {{ cephfs }} -f json"
register: _mds_fs_status_initial
changed_when: false
when: groups.get('standby_mdss', []) | length > 0

- name: Set_fact mds_failover_expected
ansible.builtin.set_fact:
mds_failover_expected: >-
{{
(
((_mds_fs_status_initial.stdout | default('{}') | from_json).standbys | default([]) | length) > 0
) or (
((_mds_fs_status_initial.stdout | default('{}') | from_json).mdsmap | default([])
| selectattr('state', 'in', ['standby', 'up:standby']) | list | length) > 0
)
}}
when: groups.get('standby_mdss', []) | length > 0

- name: Wait for CephFS standby before forcing failover
ansible.builtin.command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs status {{ cephfs }} -f json"
register: _mds_fs_status_before_failover
changed_when: false
retries: 60
delay: 5
until: >-
(
((_mds_fs_status_before_failover.stdout | from_json).standbys | default([]) | length) > 0
) or (
((_mds_fs_status_before_failover.stdout | from_json).mdsmap | default([])
| selectattr('state', 'in', ['standby', 'up:standby']) | list | length) > 0
)
when:
- groups.get('standby_mdss', []) | length > 0
- mds_failover_expected | default(false) | bool

- name: Set_fact mds_failover_ready
ansible.builtin.set_fact:
mds_failover_ready: "{{ mds_failover_expected | default(false) | bool }}"
when: groups.get('standby_mdss', []) | length > 0

- name: Skip forced failover when no standby daemon was initially available for this filesystem
ansible.builtin.debug:
msg: "No standby MDS detected for filesystem {{ cephfs }} before upgrade; skipping forced failover and continuing with in-place upgrade path."
when:
- groups.get('standby_mdss', []) | length > 0
- not (mds_failover_expected | default(false) | bool)

- name: Trigger MDS failover before upgrading active daemon
ansible.builtin.command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} mds fail {{ ansible_facts['hostname'] }}"
changed_when: false
failed_when: false
when:
- groups.get('standby_mdss', []) | length > 0
- mds_failover_ready | default(false) | bool

- name: Stop active mds immediately after failover (non-containerized)
ansible.builtin.systemd:
name: ceph-mds@{{ ansible_facts['hostname'] }}
state: stopped
enabled: false
masked: true
when:
- not containerized_deployment | bool
- groups.get('standby_mdss', []) | length > 0
- mds_failover_ready | default(false) | bool

- name: Stop active mds immediately after failover (containerized)
ansible.builtin.systemd:
name: ceph-mds@{{ ansible_facts['hostname'] }}
state: stopped
changed_when: false
when:
- containerized_deployment | bool
- groups.get('standby_mdss', []) | length > 0
- mds_failover_ready | default(false) | bool

- name: Prevent restarts from the packaging
ansible.builtin.systemd:
name: ceph-mds@{{ ansible_facts['hostname'] }}
enabled: false
masked: true
when: not containerized_deployment | bool
when:
- not containerized_deployment | bool
- groups.get('standby_mdss', []) | length == 0 or not (mds_failover_ready | default(false) | bool)

- name: Import ceph-handler role
ansible.builtin.import_role:
Expand Down Expand Up @@ -856,7 +918,7 @@
environment:
CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
CEPH_CONTAINER_BINARY: "{{ container_binary }}"
when: inventory_hostname == groups['standby_mdss'] | last
when: inventory_hostname == groups['active_mdss'] | last


- name: Upgrade ceph rgws cluster
Expand Down Expand Up @@ -1161,17 +1223,17 @@
name: ceph-facts
tasks_from: container_binary.yml

- name: Container | disallow pre-squid OSDs and enable all new squid-only functionality
ansible.builtin.command: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }} ceph --cluster {{ cluster }} osd require-osd-release squid"
- name: Container | disallow pre-{{ ceph_stable_release }} OSDs and enable new release functionality
ansible.builtin.command: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }} ceph --cluster {{ cluster }} osd require-osd-release {{ ceph_stable_release }}"
delegate_to: "{{ groups[mon_group_name][0] }}"
run_once: true
changed_when: false
when:
- containerized_deployment | bool
- groups.get(mon_group_name, []) | length > 0

- name: Non container | disallow pre-squid OSDs and enable all new squid-only functionality
ansible.builtin.command: "ceph --cluster {{ cluster }} osd require-osd-release squid"
- name: Non container | disallow pre-{{ ceph_stable_release }} OSDs and enable new release functionality
ansible.builtin.command: "ceph --cluster {{ cluster }} osd require-osd-release {{ ceph_stable_release }}"
delegate_to: "{{ groups[mon_group_name][0] }}"
run_once: true
changed_when: false
Expand Down Expand Up @@ -1348,12 +1410,24 @@

- name: Show ceph status
ansible.builtin.command: "{{ container_exec_cmd_status | default('') }} ceph --cluster {{ cluster }} -s"
register: _rolling_update_ceph_status
changed_when: false
run_once: true
delegate_to: "{{ groups[mon_group_name][0] }}"

- name: Print ceph status
ansible.builtin.debug:
msg: "{{ _rolling_update_ceph_status.stdout }}"
run_once: true

- name: Show all daemons version
ansible.builtin.command: "{{ container_exec_cmd_status | default('') }} ceph --cluster {{ cluster }} versions"
register: _rolling_update_ceph_versions
run_once: true
delegate_to: "{{ groups[mon_group_name][0] }}"
changed_when: false

- name: Print all daemons version
ansible.builtin.debug:
msg: "{{ _rolling_update_ceph_versions.stdout }}"
run_once: true
Loading