@@ -1025,6 +1025,9 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
10251025
10261026 var nodeDisruptionActions []opv1.NodeDisruptionPolicyStatusAction
10271027 var actions []string
1028+ // Check for forcefile before calculatePostConfigChange* functions delete it.
1029+ // This is needed for updateFiles to know whether to write all units (OCPBUGS-74692).
1030+ forceFilePresent := forceFileExists ()
10281031 // Node Disruption Policies cannot be used during firstboot as API is not accessible.
10291032 if ! firstBoot {
10301033 nodeDisruptionActions , err = dn .calculatePostConfigChangeNodeDisruptionAction (diff , diffFileSet , allChangedUnitNames )
@@ -1133,13 +1136,13 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
11331136 }
11341137
11351138 // update files on disk that need updating
1136- if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , skipCertificateWrite ); err != nil {
1139+ if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , skipCertificateWrite , forceFilePresent ); err != nil {
11371140 return err
11381141 }
11391142
11401143 defer func () {
11411144 if retErr != nil {
1142- if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , skipCertificateWrite ); err != nil {
1145+ if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , skipCertificateWrite , false ); err != nil {
11431146 errs := kubeErrs .NewAggregate ([]error {err , retErr })
11441147 retErr = fmt .Errorf ("error rolling back files writes: %w" , errs )
11451148 return
@@ -1277,15 +1280,18 @@ func (dn *Daemon) updateHypershift(oldConfig, newConfig *mcfgv1.MachineConfig, d
12771280 unitDiff := ctrlcommon .GetChangedConfigUnitsByType (& oldIgnConfig , & newIgnConfig )
12781281 addedOrChangedUnits := slices .Concat (unitDiff .Added , unitDiff .Updated )
12791282
1283+ // Check for forcefile to support config drift recovery (OCPBUGS-74692)
1284+ forceFilePresent := forceFileExists ()
1285+
12801286 // update files on disk that need updating
12811287 // We should't skip the certificate write in HyperShift since it does not run the extra daemon process
1282- if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , false ); err != nil {
1288+ if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , false , forceFilePresent ); err != nil {
12831289 return err
12841290 }
12851291
12861292 defer func () {
12871293 if retErr != nil {
1288- if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , false ); err != nil {
1294+ if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , false , false ); err != nil {
12891295 errs := kubeErrs .NewAggregate ([]error {err , retErr })
12901296 retErr = fmt .Errorf ("error rolling back files writes: %w" , errs )
12911297 return
@@ -1810,12 +1816,26 @@ func (dn *CoreOSDaemon) getKernelPackagesForTargetRelease() (releaseKernelPackag
18101816// whatever has been written is picked up by the appropriate daemons, if
18111817// required. in particular, a daemon-reload and restart for any unit files
18121818// touched.
1813- func (dn * Daemon ) updateFiles (oldIgnConfig , newIgnConfig ign3types.Config , addedOrChangedUnits []ign3types.Unit , skipCertificateWrite bool ) error {
1819+ func (dn * Daemon ) updateFiles (oldIgnConfig , newIgnConfig ign3types.Config , addedOrChangedUnits []ign3types.Unit , skipCertificateWrite , forceFilePresent bool ) error {
18141820 klog .Info ("Updating files" )
18151821 if err := dn .writeFiles (newIgnConfig .Storage .Files , skipCertificateWrite ); err != nil {
18161822 return err
18171823 }
1818- if err := dn .writeUnits (addedOrChangedUnits ); err != nil {
1824+
1825+ // With OCPBUGS-58023, we updated this flow to only write units that were either added or
1826+ // updated. As can be seen in OCPBUGS-74692, this impacted the traditional method to recover
1827+ // from config drifts with systemd units. It makes the `touch /run/machine-config-daemon-force`
1828+ // command useless since the new flow does not rewrite all files, only the ones that have been
1829+ // added or changed with the latest MC. To keep the fix for OCPBUGS-58023 and allow continue
1830+ // supporting the traditional config drift recovery for systemd units, all units should be
1831+ // written when a forcefile exists.
1832+ unitsToWrite := addedOrChangedUnits
1833+ if forceFilePresent {
1834+ klog .Info ("Forcefile exists, writing all units" )
1835+ unitsToWrite = newIgnConfig .Systemd .Units
1836+ }
1837+
1838+ if err := dn .writeUnits (unitsToWrite ); err != nil {
18191839 return err
18201840 }
18211841 return dn .deleteStaleData (oldIgnConfig , newIgnConfig )
0 commit comments