#include <stdio.h>

/*
 * All of this code is basically necessary in order to trigger the problem.
 * It seems that if you don't shuffle enough with the floating point registers, this simply won't do anything.
 * Note that this problem is being triggered whatever the environment is: 32 / 64 bits with or without SSE.
 * I haven't tried anything else than Intel platforms though. See comments on main() for more details.
 */

class Vector3 {
  public:
    float x, y, z;

    Vector3(float x, float y, float z) : x(x), y(y), z(z) { }
    Vector3 & operator*=(float a) {
        x *= a; y *= a; z *= a;
        return *this;
    }
    Vector3 operator-() const { return Vector3(-x, -y, -z); }
};

class Matrix4;

class Matrix3 {
  public:
    float a0, a1, a2, b0, b1, b2, c0, c1, c2;

    Matrix3(float a0, float a1, float a2,
            float b0, float b1, float b2,
            float c0, float c1, float c2) :
            a0(a0), a1(a1), a2(a2),
            b0(b0), b1(b1), b2(b2),
            c0(c0), c1(c1), c2(c2) { }

    explicit Matrix3(const Matrix4& m);

    inline Matrix3 Transpose() const { return Matrix3(a0, b0, c0, a1, b1, c1, a2, b2, c2);}
};

class Matrix4 {
  public:
    float a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3;

    Matrix4() :
        a0(1.0f), a1(0.0f), a2(0.0f), a3(0.0f),
        b0(0.0f), b1(1.0f), b2(0.0f), b3(0.0f),
        c0(0.0f), c1(0.0f), c2(1.0f), c3(0.0f),
        d0(0.0f), d1(0.0f), d2(0.0f), d3(1.0f) { }

    explicit Matrix4(const Matrix3& m) :
        a0(m.a0), a1(m.a1), a2(m.a2), a3(0.0f),
        b0(m.b0), b1(m.b1), b2(m.b2), b3(0.0f),
        c0(m.c0), c1(m.c1), c2(m.c2), c3(0.0f),
        d0(0.0f), d1(0.0f), d2(0.0f), d3(1.0f) { }

    /* This is the most offending part: this is a bad case of pointer aliasing. This is being used in Scale later. */
    Vector3 *Row0AsVec3() { return (Vector3 *) &a0; }
    Vector3 *Row1AsVec3() { return (Vector3 *) &b0; }
    Vector3 *Row2AsVec3() { return (Vector3 *) &c0; }
    Vector3 *Row3AsVec3() { return (Vector3 *) &d0; }
    const Vector3 *Row0AsVec3() const { return (Vector3 *) &a0; }
    const Vector3 *Row1AsVec3() const { return (Vector3 *) &b0; }
    const Vector3 *Row2AsVec3() const { return (Vector3 *) &c0; }
    const Vector3 *Row3AsVec3() const { return (Vector3 *) &d0; }

    friend Vector3 operator*(const Vector3 & v, const Matrix4 & r);

    void Translate(const Vector3 & move);
    void Scale(float scale);

    Matrix4 AffineInverse(float uniformScale) const;
};

Matrix3::Matrix3(const Matrix4& m) {
    a0 = m.a0; a1 = m.a1; a2 = m.a2;
    b0 = m.b0; b1 = m.b1; b2 = m.b2;
    c0 = m.c0; c1 = m.c1; c2 = m.c2;
}

Vector3 operator*(const Vector3& v, const Matrix4& r) {
    return Vector3(
        v.x * r.a0 + v.y * r.b0 + v.z * r.c0 + r.d0,
        v.x * r.a1 + v.y * r.b1 + v.z * r.c1 + r.d1,
        v.x * r.a2 + v.y * r.b2 + v.z * r.c2 + r.d2);
}

void Matrix4::Translate(const Vector3 & move) {
    d0 += move.x * a0 + move.y * b0 + move.z * c0;
    d1 += move.x * a1 + move.y * b1 + move.z * c1;
    d2 += move.x * a2 + move.y * b2 + move.z * c2;
}

/*
 * This is the problematic function: when gcc is going to optimize it by inlining Scale in it,
 * it seems it's going to think that only a0, b0 and c0 are being modified, and won't taint
 * a1, a2, b1, b2, c1, c2 so that the old values will be kept before and after; also note the
 * second usage of pointer aliasing there.
 */

Matrix4 Matrix4::AffineInverse(float uniformScale) const {
    Matrix4 matrix(Matrix3(*this).Transpose());

    matrix.Scale(1.0f / (uniformScale * uniformScale));
    matrix.Translate(-*Row3AsVec3());
    return matrix;
}

/*
 * Here's the biggest problem. Changing this method to avoid pointer aliasing will make the whole thing work.
 */

void Matrix4::Scale(float scale) {
    *Row0AsVec3() *= scale;
    *Row1AsVec3() *= scale;
    *Row2AsVec3() *= scale;
}

/*
 * Simple testcase. A buggy gcc (4.2.1 to 4.3.2 or the redhat ones) will display "x = 2.857178, y = 316.570801, z = -24.159454",
 * whereas a working one (4.4.0+ or 4.2.0-) will display "x = 2.857178, y = -11.428467, z = 1.904770"
 *
 * Note that with gcc 4.4.0+, you have to add -fno-strict-aliasing, otherwise you get "x = -247.555176, y = 301.496094, z = -23.009033",
 * which isn't the case at all with gcc 4.2.0-, or gcc 4.3.3 and 4.3.4.
 */

int main() {
    Vector3 A(2692, -3360, 267), B(2695, -3372, 269);
    float s = 1.05f;
    Matrix4 matrix;

    matrix.Translate(A);
    matrix.Scale(s);
    matrix = matrix.AffineInverse(s);
    Vector3 r = B * matrix;

    fprintf(stderr, "x = %f, y = %f, z = %f\n", r.x, r.y, r.z);
}

Generated by GNU enscript 1.6.4.