diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample index 54f7f08f87..dcb55d95a3 100644 --- a/group_vars/all.yml.sample +++ b/group_vars/all.yml.sample @@ -155,7 +155,7 @@ dummy: # #ceph_mirror: https://download.ceph.com #ceph_stable_key: https://download.ceph.com/keys/release.asc -#ceph_stable_release: squid +#ceph_stable_release: tentacle #ceph_stable_repo: "{{ ceph_mirror }}/debian-{{ ceph_stable_release }}" #nfs_ganesha_stable: true # use stable repos for nfs-ganesha @@ -584,7 +584,7 @@ dummy: #dashboard_disabled_features: [] #prometheus_frontend_vip: '' #alertmanager_frontend_vip: '' -#node_exporter_container_image: "docker.io/prom/node-exporter:v0.17.0" +#node_exporter_container_image: "quay.io/prometheus/node-exporter:v1.9.1" #node_exporter_port: 9100 #grafana_admin_user: admin # This variable must be set with a strong custom password when dashboard_enabled is True @@ -594,7 +594,7 @@ dummy: #grafana_key: '' # When using https, please fill with a hostname for which grafana_crt is valid. #grafana_server_fqdn: '' -#grafana_container_image: "docker.io/grafana/grafana:6.7.4" +#grafana_container_image: "quay.io/ceph/grafana:12.3.1" #grafana_container_cpu_period: 100000 #grafana_container_cpu_cores: 2 # container_memory is in GB @@ -624,7 +624,7 @@ dummy: #grafana_port: 3000 #grafana_network: "{{ public_network }}" #grafana_conf_overrides: {} -#prometheus_container_image: "docker.io/prom/prometheus:v2.7.2" +#prometheus_container_image: "quay.io/prometheus/prometheus:v3.6.0" #prometheus_container_cpu_period: 100000 #prometheus_container_cpu_cores: 2 # container_memory is in GB @@ -637,7 +637,7 @@ dummy: # Uncomment out this variable if you need to customize the retention period for prometheus storage. # set it to '30d' if you want to retain 30 days of data. 
# prometheus_storage_tsdb_retention_time: 15d -#alertmanager_container_image: "docker.io/prom/alertmanager:v0.16.2" +#alertmanager_container_image: "quay.io/prometheus/alertmanager:v0.28.1" #alertmanager_container_cpu_period: 100000 #alertmanager_container_cpu_cores: 2 # container_memory is in GB diff --git a/infrastructure-playbooks/rolling_update.yml b/infrastructure-playbooks/rolling_update.yml index 839609a728..94a7fe626e 100644 --- a/infrastructure-playbooks/rolling_update.yml +++ b/infrastructure-playbooks/rolling_update.yml @@ -579,6 +579,7 @@ environment: CEPH_VOLUME_DEBUG: "{{ ceph_volume_debug }}" when: not containerized_deployment | bool + and ceph_stable_release == 'nautilus' - name: Activate scanned ceph-disk osds and migrate to ceph-volume if deploying nautilus ceph_volume_simple_activate: @@ -587,6 +588,7 @@ environment: CEPH_VOLUME_DEBUG: "{{ ceph_volume_debug }}" when: not containerized_deployment | bool + and ceph_stable_release == 'nautilus' - name: Waiting for clean pgs... 
ansible.builtin.command: "{{ container_exec_cmd_update_osd | default('') }} ceph --cluster {{ cluster }} pg stat --format json" @@ -655,6 +657,18 @@ ansible.builtin.import_role: name: ceph-facts + - name: Disable allow_standby_replay during rolling mds upgrade + ceph_fs: + name: "{{ cephfs }}" + cluster: "{{ cluster }}" + data: "{{ cephfs_data_pool.name }}" + metadata: "{{ cephfs_metadata_pool.name }}" + allow_standby_replay: false + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + when: _cephfs_info.value | default(false) | bool + - name: Deactivate all mds rank > 0 if any when: groups.get(mds_group_name, []) | length > 1 block: @@ -674,10 +688,11 @@ name: "{{ cephfs }}" cluster: "{{ cluster }}" state: info + info_param: mds_in_ranks register: wait_rank_zero retries: 720 delay: 5 - until: (wait_rank_zero.stdout | from_json).mdsmap.in | length == 1 and (wait_rank_zero.stdout | from_json).mdsmap.in[0] == 0 + until: (wait_rank_zero.value | default([]) | length) == 1 and (wait_rank_zero.value | default([]) | first) == 0 environment: CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" CEPH_CONTAINER_BINARY: "{{ container_binary }}" @@ -706,33 +721,6 @@ ansible_port: "{{ hostvars[item]['ansible_port'] | default(omit) }}" with_items: "{{ groups[mds_group_name] | difference(mds_active_host) }}" - - name: Stop standby ceph mds - ansible.builtin.systemd: - name: "ceph-mds@{{ hostvars[item]['ansible_facts']['hostname'] }}" - state: stopped - enabled: false - delegate_to: "{{ item }}" - with_items: "{{ groups['standby_mdss'] }}" - when: groups['standby_mdss'] | default([]) | length > 0 - - # dedicated task for masking systemd unit - # somehow, having a single task doesn't work in containerized context - - name: 
Mask systemd units for standby ceph mds - ansible.builtin.systemd: - name: "ceph-mds@{{ hostvars[item]['ansible_facts']['hostname'] }}" - masked: true - delegate_to: "{{ item }}" - with_items: "{{ groups['standby_mdss'] }}" - when: groups['standby_mdss'] | default([]) | length > 0 - - - name: Wait until all standbys mds are stopped - ansible.builtin.command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs dump -f json" - changed_when: false - register: wait_standbys_down - retries: 300 - delay: 5 - until: (wait_standbys_down.stdout | from_json).standbys | length == 0 - - name: Create active_mdss group ansible.builtin.add_host: name: "{{ mds_active_host[0] if mds_active_host is defined else groups.get(mds_group_name)[0] }}" @@ -741,10 +729,10 @@ ansible_port: "{{ hostvars[mds_active_host[0] if mds_active_host is defined else groups.get(mds_group_name)[0]]['ansible_port'] | default(omit) }}" -- name: Upgrade active mds +- name: Upgrade standby ceph mdss cluster vars: upgrade_ceph_packages: true - hosts: active_mdss + hosts: standby_mdss tags: mdss become: true gather_facts: false @@ -760,10 +748,18 @@ - name: Prevent restart from the packaging ansible.builtin.systemd: name: ceph-mds@{{ ansible_facts['hostname'] }} + state: stopped enabled: false masked: true when: not containerized_deployment | bool + - name: Stop standby mds before package upgrade (containerized) + ansible.builtin.systemd: + name: ceph-mds@{{ ansible_facts['hostname'] }} + state: stopped + changed_when: false + when: containerized_deployment | bool + - name: Import ceph-handler role ansible.builtin.import_role: name: ceph-handler @@ -786,23 +782,10 @@ ansible.builtin.import_role: name: ceph-mds - - name: Restart ceph mds - ansible.builtin.systemd: - name: ceph-mds@{{ ansible_facts['hostname'] }} - state: restarted - enabled: true - masked: false - when: not containerized_deployment | bool - - - name: Restart active mds - ansible.builtin.command: "{{ container_binary }} stop 
ceph-mds-{{ ansible_facts['hostname'] }}" - changed_when: false - when: containerized_deployment | bool - -- name: Upgrade standbys ceph mdss cluster +- name: Upgrade active mds vars: upgrade_ceph_packages: true - hosts: standby_mdss + hosts: active_mdss tags: mdss become: true gather_facts: false @@ -816,12 +799,91 @@ ansible.builtin.import_role: name: ceph-facts + - name: Get CephFS status before deciding failover wait + ansible.builtin.command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs status {{ cephfs }} -f json" + register: _mds_fs_status_initial + changed_when: false + when: groups.get('standby_mdss', []) | length > 0 + + - name: Set_fact mds_failover_expected + ansible.builtin.set_fact: + mds_failover_expected: >- + {{ + ( + ((_mds_fs_status_initial.stdout | default('{}') | from_json).standbys | default([]) | length) > 0 + ) or ( + ((_mds_fs_status_initial.stdout | default('{}') | from_json).mdsmap | default([]) + | selectattr('state', 'in', ['standby', 'up:standby']) | list | length) > 0 + ) + }} + when: groups.get('standby_mdss', []) | length > 0 + + - name: Wait for CephFS standby before forcing failover + ansible.builtin.command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs status {{ cephfs }} -f json" + register: _mds_fs_status_before_failover + changed_when: false + retries: 60 + delay: 5 + until: >- + ( + ((_mds_fs_status_before_failover.stdout | from_json).standbys | default([]) | length) > 0 + ) or ( + ((_mds_fs_status_before_failover.stdout | from_json).mdsmap | default([]) + | selectattr('state', 'in', ['standby', 'up:standby']) | list | length) > 0 + ) + when: + - groups.get('standby_mdss', []) | length > 0 + - mds_failover_expected | default(false) | bool + + - name: Set_fact mds_failover_ready + ansible.builtin.set_fact: + mds_failover_ready: "{{ mds_failover_expected | default(false) | bool }}" + when: groups.get('standby_mdss', []) | length > 0 + + - name: Skip forced failover when no 
standby daemon was initially available for this filesystem + ansible.builtin.debug: + msg: "No standby MDS detected for filesystem {{ cephfs }} before upgrade; skipping forced failover and continuing with in-place upgrade path." + when: + - groups.get('standby_mdss', []) | length > 0 + - not (mds_failover_expected | default(false) | bool) + + - name: Trigger MDS failover before upgrading active daemon + ansible.builtin.command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} mds fail {{ ansible_facts['hostname'] }}" + changed_when: false + failed_when: false + when: + - groups.get('standby_mdss', []) | length > 0 + - mds_failover_ready | default(false) | bool + + - name: Stop active mds immediately after failover (non-containerized) + ansible.builtin.systemd: + name: ceph-mds@{{ ansible_facts['hostname'] }} + state: stopped + enabled: false + masked: true + when: + - not containerized_deployment | bool + - groups.get('standby_mdss', []) | length > 0 + - mds_failover_ready | default(false) | bool + + - name: Stop active mds immediately after failover (containerized) + ansible.builtin.systemd: + name: ceph-mds@{{ ansible_facts['hostname'] }} + state: stopped + changed_when: false + when: + - containerized_deployment | bool + - groups.get('standby_mdss', []) | length > 0 + - mds_failover_ready | default(false) | bool + - name: Prevent restarts from the packaging ansible.builtin.systemd: name: ceph-mds@{{ ansible_facts['hostname'] }} enabled: false masked: true - when: not containerized_deployment | bool + when: + - not containerized_deployment | bool + - groups.get('standby_mdss', []) | length == 0 or not (mds_failover_ready | default(false) | bool) - name: Import ceph-handler role ansible.builtin.import_role: @@ -856,7 +918,7 @@ environment: CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" CEPH_CONTAINER_BINARY: "{{ container_binary }}" - when: 
inventory_hostname == groups['standby_mdss'] | last + when: inventory_hostname == groups['active_mdss'] | last - name: Upgrade ceph rgws cluster @@ -1161,8 +1223,8 @@ name: ceph-facts tasks_from: container_binary.yml - - name: Container | disallow pre-squid OSDs and enable all new squid-only functionality - ansible.builtin.command: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }} ceph --cluster {{ cluster }} osd require-osd-release squid" + - name: Container | disallow pre-{{ ceph_stable_release }} OSDs and enable new release functionality + ansible.builtin.command: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }} ceph --cluster {{ cluster }} osd require-osd-release {{ ceph_stable_release }}" delegate_to: "{{ groups[mon_group_name][0] }}" run_once: true changed_when: false @@ -1170,8 +1232,8 @@ - containerized_deployment | bool - groups.get(mon_group_name, []) | length > 0 - - name: Non container | disallow pre-squid OSDs and enable all new squid-only functionality - ansible.builtin.command: "ceph --cluster {{ cluster }} osd require-osd-release squid" + - name: Non container | disallow pre-{{ ceph_stable_release }} OSDs and enable new release functionality + ansible.builtin.command: "ceph --cluster {{ cluster }} osd require-osd-release {{ ceph_stable_release }}" delegate_to: "{{ groups[mon_group_name][0] }}" run_once: true changed_when: false @@ -1348,12 +1410,24 @@ - name: Show ceph status ansible.builtin.command: "{{ container_exec_cmd_status | default('') }} ceph --cluster {{ cluster }} -s" + register: _rolling_update_ceph_status changed_when: false run_once: true delegate_to: "{{ groups[mon_group_name][0] }}" + - name: Print ceph status + ansible.builtin.debug: + msg: "{{ _rolling_update_ceph_status.stdout }}" + run_once: true + - name: Show all daemons version ansible.builtin.command: "{{ container_exec_cmd_status | default('') }} ceph 
--cluster {{ cluster }} versions" + register: _rolling_update_ceph_versions run_once: true delegate_to: "{{ groups[mon_group_name][0] }}" changed_when: false + + - name: Print all daemons version + ansible.builtin.debug: + msg: "{{ _rolling_update_ceph_versions.stdout }}" + run_once: true diff --git a/library/ceph_fs.py b/library/ceph_fs.py index 10db2b161b..1cf74d8016 100644 --- a/library/ceph_fs.py +++ b/library/ceph_fs.py @@ -79,6 +79,19 @@ description: - name of the max_mds attribute. required: false + max_file_size: + description: + - Maximum allowed file size for the filesystem. + required: false + allow_standby_replay: + description: + - Whether this CephFS allows standby-replay daemons. + required: false + info_param: + description: + - When C(state=info), return only a single parsed field in C(value). + required: false + choices: ['allow_standby_replay', 'max_mds', 'max_file_size', 'mds_in_count', 'mds_in_ranks'] author: @@ -181,16 +194,14 @@ def fail_fs(module, container_image=None): return cmd -def set_fs(module, container_image=None): +def set_fs(module, param, value, container_image=None): ''' Set parameter to a fs ''' cluster = module.params.get('cluster') name = module.params.get('name') - max_mds = module.params.get('max_mds') - - args = ['set', name, 'max_mds', str(max_mds)] + args = ['set', name, str(param), str(value)] cmd = generate_cmd(sub_cmd=['fs'], args=args, @@ -200,6 +211,32 @@ def set_fs(module, container_image=None): return cmd +def get_allow_standby_replay(fs): + ''' + Return allow_standby_replay from fs json payload + ''' + + mdsmap = fs.get('mdsmap', {}) + flags_state = mdsmap.get('flags_state', {}) + return bool(flags_state.get('allow_standby_replay', mdsmap.get('allow_standby_replay', False))) + + +def parse_fs_info(out): + ''' + Parse fs info output and return useful fields for playbooks. 
+ ''' + + fs = json.loads(out) + mdsmap = fs.get('mdsmap', {}) + return { + 'allow_standby_replay_current': get_allow_standby_replay(fs), + 'max_mds_current': mdsmap.get('max_mds'), + 'max_file_size_current': mdsmap.get('max_file_size'), + 'mds_in_ranks_current': mdsmap.get('in', []), + 'mds_in_count_current': len(mdsmap.get('in', [])), + } + + def run_module(): module_args = dict( cluster=dict(type='str', required=False, default='ceph'), @@ -208,6 +245,9 @@ def run_module(): data=dict(type='str', required=False), metadata=dict(type='str', required=False), max_mds=dict(type='int', required=False), + max_file_size=dict(type='int', required=False), + allow_standby_replay=dict(type='bool', required=False), + info_param=dict(type='str', required=False, choices=['allow_standby_replay', 'max_mds', 'max_file_size', 'mds_in_count', 'mds_in_ranks']), ) module = AnsibleModule( @@ -220,6 +260,9 @@ def run_module(): name = module.params.get('name') state = module.params.get('state') max_mds = module.params.get('max_mds') + max_file_size = module.params.get('max_file_size') + allow_standby_replay = module.params.get('allow_standby_replay') + info_param = module.params.get('info_param') if module.check_mode: module.exit_json( @@ -243,13 +286,25 @@ def run_module(): if rc == 0: fs = json.loads(out) if max_mds and fs["mdsmap"]["max_mds"] != max_mds: - rc, cmd, out, err = exec_command(module, set_fs(module, container_image=container_image)) # noqa: E501 + rc, cmd, out, err = exec_command(module, set_fs(module, 'max_mds', max_mds, container_image=container_image)) # noqa: E501 + if rc == 0: + changed = True + if max_file_size is not None and fs["mdsmap"]["max_file_size"] != max_file_size: + rc, cmd, out, err = exec_command(module, set_fs(module, 'max_file_size', max_file_size, container_image=container_image)) # noqa: E501 + if rc == 0: + changed = True + if allow_standby_replay is not None and get_allow_standby_replay(fs) != allow_standby_replay: + rc, cmd, out, err = 
exec_command(module, set_fs(module, 'allow_standby_replay', 'true' if allow_standby_replay else 'false', container_image=container_image)) # noqa: E501 if rc == 0: changed = True else: rc, cmd, out, err = exec_command(module, create_fs(module, container_image=container_image)) # noqa: E501 if max_mds and max_mds > 1: - exec_command(module, set_fs(module, container_image=container_image)) # noqa: E501 + exec_command(module, set_fs(module, 'max_mds', max_mds, container_image=container_image)) # noqa: E501 + if max_file_size is not None: + exec_command(module, set_fs(module, 'max_file_size', max_file_size, container_image=container_image)) # noqa: E501 + if allow_standby_replay: + exec_command(module, set_fs(module, 'allow_standby_replay', 'true', container_image=container_image)) # noqa: E501 if rc == 0: changed = True @@ -266,6 +321,38 @@ def run_module(): elif state == "info": rc, cmd, out, err = exec_command(module, get_fs(module, container_image=container_image)) # noqa: E501 + if rc == 0: + endd = datetime.datetime.now() + delta = endd - startd + parsed = parse_fs_info(out) + if info_param == 'allow_standby_replay': + value = parsed['allow_standby_replay_current'] + elif info_param == 'max_mds': + value = parsed['max_mds_current'] + elif info_param == 'max_file_size': + value = parsed['max_file_size_current'] + elif info_param == 'mds_in_count': + value = parsed['mds_in_count_current'] + elif info_param == 'mds_in_ranks': + value = parsed['mds_in_ranks_current'] + else: + value = None + module.exit_json( + cmd=cmd, + start=str(startd), + end=str(endd), + delta=str(delta), + rc=rc, + stdout=out.rstrip("\r\n"), + stderr=err.rstrip("\r\n"), + changed=changed, + allow_standby_replay_current=parsed['allow_standby_replay_current'], + max_mds_current=parsed['max_mds_current'], + max_file_size_current=parsed['max_file_size_current'], + mds_in_count_current=parsed['mds_in_count_current'], + mds_in_ranks_current=parsed['mds_in_ranks_current'], + value=value, + ) 
exit_module(module=module, out=out, rc=rc, cmd=cmd, err=err, startd=startd, changed=changed) # noqa: E501 diff --git a/roles/ceph-container-engine/vars/AlmaLinux-9.yml b/roles/ceph-container-engine/vars/AlmaLinux-9.yml new file mode 100644 index 0000000000..a46ed44fbc --- /dev/null +++ b/roles/ceph-container-engine/vars/AlmaLinux-9.yml @@ -0,0 +1,3 @@ +--- +container_package_name: podman +container_service_name: podman diff --git a/roles/ceph-container-engine/vars/RedHat-9.yml b/roles/ceph-container-engine/vars/RedHat-9.yml new file mode 100644 index 0000000000..a46ed44fbc --- /dev/null +++ b/roles/ceph-container-engine/vars/RedHat-9.yml @@ -0,0 +1,3 @@ +--- +container_package_name: podman +container_service_name: podman diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml index 1e004fdd97..f5e8bc12b2 100644 --- a/roles/ceph-defaults/defaults/main.yml +++ b/roles/ceph-defaults/defaults/main.yml @@ -25,6 +25,7 @@ ceph_release_num: quincy: 17 reef: 18 squid: 19 + tentacle: 20 dev: 99 @@ -596,10 +597,16 @@ grafana_datasource: Dashboard grafana_dashboards_path: "/etc/grafana/dashboards/ceph-dashboard" grafana_dashboard_version: main grafana_dashboard_files: + - ceph-application-overview.json + - ceph-cluster-advanced.json - ceph-cluster.json + - ceph-nvmeof-performance.json + - ceph-nvmeof.json - cephfs-overview.json + - cephfsdashboard.json - host-details.json - hosts-overview.json + - multi-cluster-overview.json - osd-device-details.json - osds-overview.json - pool-detail.json @@ -609,6 +616,8 @@ grafana_dashboard_files: - radosgw-sync-overview.json - rbd-details.json - rbd-overview.json + - rgw-s3-analytics.json + - smb-overview.json grafana_plugins: - vonage-status-panel - grafana-piechart-panel diff --git a/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml.j2 b/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml.j2 index 0ff13ad576..d48ff31777 100644 --- 
a/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml.j2 +++ b/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml.j2 @@ -9,18 +9,18 @@ deleteDatasources: # what's available in the database datasources: # name of the datasource. Required -- name: '{{ grafana_datasource }}' + - name: '{{ grafana_datasource }}' # datasource type. Required - type: 'prometheus' + type: 'prometheus' # access mode. proxy or direct (Server or Browser in the UI). Required - access: 'proxy' + access: 'proxy' # org id. will default to orgId 1 if not specified - orgId: 1 + orgId: 1 # url - url: 'http://{{ grafana_server_addr }}:{{ prometheus_port }}' + url: 'http://{{ grafana_server_addr }}:{{ prometheus_port }}' # enable/disable basic auth - basicAuth: false + basicAuth: false # mark as default datasource. Max one per org - isDefault: true + isDefault: true # allow users to edit datasources from the UI. - editable: false + editable: false diff --git a/roles/ceph-validate/tasks/check_repository.yml b/roles/ceph-validate/tasks/check_repository.yml index 01067dc441..3f81dc1991 100644 --- a/roles/ceph-validate/tasks/check_repository.yml +++ b/roles/ceph-validate/tasks/check_repository.yml @@ -12,8 +12,8 @@ - name: Validate ceph_repository_community ansible.builtin.fail: - msg: "ceph_stable_release must be 'squid'" + msg: "ceph_stable_release must be either 'reef' or 'tentacle'" when: - ceph_origin == 'repository' - ceph_repository == 'community' - - ceph_stable_release not in ['squid'] + - ceph_stable_release not in ['reef', 'tentacle']