@@ -1026,6 +1026,9 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
10261026
10271027 var nodeDisruptionActions []opv1.NodeDisruptionPolicyStatusAction
10281028 var actions []string
1029+ // Check for forcefile before calculatePostConfigChange* functions delete it.
1030+ // This is needed for updateFiles to know whether to write all units (OCPBUGS-74692).
1031+ forceFilePresent := forceFileExists ()
10291032 // Node Disruption Policies cannot be used during firstboot as API is not accessible.
10301033 if ! firstBoot {
10311034 nodeDisruptionActions , err = dn .calculatePostConfigChangeNodeDisruptionAction (diff , diffFileSet , allChangedUnitNames )
@@ -1134,13 +1137,13 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
11341137 }
11351138
11361139 // update files on disk that need updating
1137- if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , skipCertificateWrite ); err != nil {
1140+ if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , skipCertificateWrite , forceFilePresent ); err != nil {
11381141 return err
11391142 }
11401143
11411144 defer func () {
11421145 if retErr != nil {
1143- if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , skipCertificateWrite ); err != nil {
1146+ if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , skipCertificateWrite , false ); err != nil {
11441147 errs := kubeErrs .NewAggregate ([]error {err , retErr })
11451148 retErr = fmt .Errorf ("error rolling back files writes: %w" , errs )
11461149 return
@@ -1278,15 +1281,18 @@ func (dn *Daemon) updateHypershift(oldConfig, newConfig *mcfgv1.MachineConfig, d
12781281 unitDiff := ctrlcommon .GetChangedConfigUnitsByType (& oldIgnConfig , & newIgnConfig )
12791282 addedOrChangedUnits := slices .Concat (unitDiff .Added , unitDiff .Updated )
12801283
1284+ // Check for forcefile to support config drift recovery (OCPBUGS-74692)
1285+ forceFilePresent := forceFileExists ()
1286+
12811287 // update files on disk that need updating
12821288 // We should't skip the certificate write in HyperShift since it does not run the extra daemon process
1283- if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , false ); err != nil {
1289+ if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , false , forceFilePresent ); err != nil {
12841290 return err
12851291 }
12861292
12871293 defer func () {
12881294 if retErr != nil {
1289- if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , false ); err != nil {
1295+ if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , false , false ); err != nil {
12901296 errs := kubeErrs .NewAggregate ([]error {err , retErr })
12911297 retErr = fmt .Errorf ("error rolling back files writes: %w" , errs )
12921298 return
@@ -1811,12 +1817,26 @@ func (dn *CoreOSDaemon) getKernelPackagesForTargetRelease() (releaseKernelPackag
18111817// whatever has been written is picked up by the appropriate daemons, if
18121818// required. in particular, a daemon-reload and restart for any unit files
18131819// touched.
1814- func (dn * Daemon ) updateFiles (oldIgnConfig , newIgnConfig ign3types.Config , addedOrChangedUnits []ign3types.Unit , skipCertificateWrite bool ) error {
1820+ func (dn * Daemon ) updateFiles (oldIgnConfig , newIgnConfig ign3types.Config , addedOrChangedUnits []ign3types.Unit , skipCertificateWrite , forceFilePresent bool ) error {
18151821 klog .Info ("Updating files" )
18161822 if err := dn .writeFiles (newIgnConfig .Storage .Files , skipCertificateWrite ); err != nil {
18171823 return err
18181824 }
1819- if err := dn .writeUnits (addedOrChangedUnits ); err != nil {
1825+
1826+ // With OCPBUGS-58023, we updated this flow to only write units that were either added or
1827+ // updated. As can be seen in OCPBUGS-74692, this impacted the traditional method to recover
1828+ // from config drifts with systemd units. It makes the `touch /run/machine-config-daemon-force`
1829+ // command useless since the new flow does not rewrite all files, only the ones that have been
1830+ // added or changed with the latest MC. To keep the fix for OCPBUGS-58023 and allow continue
1831+ // supporting the traditional config drift recovery for systemd units, all units should be
1832+ // written when a forcefile exists.
1833+ unitsToWrite := addedOrChangedUnits
1834+ if forceFilePresent {
1835+ klog .Info ("Forcefile exists, writing all units" )
1836+ unitsToWrite = newIgnConfig .Systemd .Units
1837+ }
1838+
1839+ if err := dn .writeUnits (unitsToWrite ); err != nil {
18201840 return err
18211841 }
18221842 return dn .deleteStaleData (oldIgnConfig , newIgnConfig )
0 commit comments