From 466e843e8b9017c56f294b574c0d675ccd488caf Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Sat, 28 Sep 2013 09:01:52 -0700 Subject: [PATCH] feeds: Make Feeds.save fully atomic, assuming a working fsync MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit If the disk is full (or there are other OS-level issues), a file may not be completely written to the disk. The write-flush-fsync-rename sequence is much safer. The fsync invocation matches the recommendation in the docs [1]: If you’re starting with a buffered Python file object f, first do f.flush(), and then do os.fsync(f.fileno()), to ensure that all internal buffers associated with f are written to disk. The purpose of each step is: * write: move the data into a library buffer * flush: flush the library buffer into a kernel buffer * fsync: flush the kernel buffer onto the disk at $tempfile * rename: adjust the metadata so that the $filename points to the $tempfile data, release the old data This means that if the rename works we get the new data, and if the rename fails we still have the old data. However, POSIX's fsync is implementation defined unless _POSIX_SYNCHRONIZED_IO is defined [3,4], and some OS X implementations go the no-op route, as Stewart Smith points out in his excellent "Eat My Data: How everybody gets file I/O wrong" [4]. If you want to run rss2email on such a system, verifying your data integrity is up to you ;). We used to write-rename the data file (but not the config) on *nix [2]. Now we do the full write-flush-fsync-rename for both the config and data files on both *nix and other systems. [1]: http://docs.python.org/3/library/os.html#os.fsync [2]: For rss2email, *nix is "has fcntl, but isn't SunOS [3]: http://pubs.opengroup.org/onlinepubs/009695399/functions/fsync.html [4]: https://www.flamingspork.com/talks/2007/06/eat_my_data.odp Reported-by: Etienne Millon Signed-off-by: W. Trevor King --- rss2email/feeds.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/rss2email/feeds.py b/rss2email/feeds.py index 2de739c..edd5f66 100644 --- a/rss2email/feeds.py +++ b/rss2email/feeds.py @@ -344,8 +344,12 @@ class Feeds (list): dirname = _os.path.dirname(self.configfiles[-1]) if dirname and not _os.path.isdir(dirname): _os.makedirs(dirname, mode=0o700, exist_ok=True) - with open(self.configfiles[-1], 'w') as f: + tmpfile = self.configfiles[-1] + '.tmp' + with open(tmpfile, 'w') as f: self.config.write(f) + f.flush() + _os.fsync(f.fileno()) + _os.rename(tmpfile, self.configfiles[-1]) self._save_feeds() def _save_feeds(self): @@ -353,17 +357,15 @@ class Feeds (list): dirname = _os.path.dirname(self.datafile) if dirname and not _os.path.isdir(dirname): _os.makedirs(dirname, mode=0o700, exist_ok=True) - if UNIX: - tmpfile = self.datafile + '.tmp' - with _codecs.open(tmpfile, 'w', self.datafile_encoding) as f: - self._save_feed_states(feeds=self, stream=f) - _os.rename(tmpfile, self.datafile) - if self._datafile_lock is not None: - self._datafile_lock.close() # release the lock - self._datafile_lock = None - else: - with _codecs.open(self.datafile, 'w', self.datafile_encoding) as f: - self._save_feed_states(feeds=self, stream=f) + tmpfile = self.datafile + '.tmp' + with _codecs.open(tmpfile, 'w', self.datafile_encoding) as f: + self._save_feed_states(feeds=self, stream=f) + f.flush() + _os.fsync(f.fileno()) + _os.rename(tmpfile, self.datafile) + if UNIX and self._datafile_lock is not None: + self._datafile_lock.close() # release the lock + self._datafile_lock = None def _save_feed_states(self, feeds, stream): _json.dump( -- 2.26.2