\documentclass[11pt]{article}
\usepackage{amsmath, amssymb, amsthm, graphicx}
\usepackage[authoryear, round]{natbib}
\usepackage{color}
\usepackage{float}
\def\post{{p}}
\def\bR{\mathbb{R}} % real line
\def\defn{{\stackrel{\rm def}{=}}}
\newcommand{\CN}{{\mathcal N}}
\def\bE{\mathbb{E}} % expectation
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newcommand{\eps}{\varepsilon}
\def\vec#1{\mathchoice{\mbox{\boldmath$\displaystyle\bf#1$}}
{\mbox{\boldmath$\textstyle\bf#1$}}
{\mbox{\boldmath$\scriptstyle\bf#1$}}
{\mbox{\boldmath$\scriptscriptstyle\bf#1$}}}

\begin{document}

\section{Introduction}

{\color{red} Can I learn anything about optimization theory as it relates to (say) sequentially compact sequences? How about regarding functions of a Euclidean and functional variable?}

\subsection{Kernel smoothing operators}

Let $\Omega\subset \bR^r$ be a compact set and let $L_1^+(\Omega)$ denote the set of positive functions that are Lebesgue integrable on $\Omega$.
For some vector $\vec h\in \bR^r$ of positive values and $\vec x \in \Omega$, let
\[
K_{\vec h}(\vec x) = \prod_{i=1}^r \frac1{h_i} K \left( \frac{x_i}{h_i} \right)
\]
be a product kernel density function, where $K(\cdot)$ is some fixed nonnegative kernel density function on $\bR$.
Frequently, we take $\vec h$ to be $(h, \ldots, h)^\top$ and write $K_h(\vec x)$.
For any $\phi\in L_1^+(\Omega)$, we define the operator ${\cal S}:L_1^+(\Omega)\to L_1^+(\Omega)$ by
\[
{\cal S}\phi(\vec x) = \int_\Omega K_h(\vec x - \vec u) \phi(\vec u)\, d\vec u.
\]
By this definition, ${\cal S}$ is a linear operator on $L_1^+(\Omega)$ and it depends implicitly on $K(\cdot)$ and $h$ (or $\vec h$).
Similarly, we define a nonlinear operator ${\cal N}:L_1^+(\Omega)\to L_1^+(\Omega)$ by
\begin{eqnarray*}
{\cal N}\phi(\vec x) &=& \exp\left\{ \int_\Omega K_h(\vec x - \vec u) \log \phi(\vec u) \, d\vec u \right\}.
\end{eqnarray*}
Each $\phi\in L_1^+(\Omega)$, since it is a positive function defined on a compact set, must be bounded away from zero, which means that $|\log\phi|$ is bounded and ${\cal N}\phi(\vec x)$ is finite.
Here are two inequalities satisfied by these operators, as proved by Eggermont and LaRiccia (1995, Lemma~7.1).
First,
\begin{equation}\label{ELineq1}
{\cal N}\phi(\vec x)\le \{{\cal S}\sqrt{\phi} (\vec x)\}^{2} \le {\cal S}\phi(\vec x)
\end{equation}
for all $\phi\in L_1^+(\Omega)$ and almost all $\vec x\in\Omega$.
The outer inequality relating ${\cal N}\phi$ and ${\cal S}\phi$ follows from the arithmetic-geometric mean inequality, and it has $\int_\Omega {\cal N}\phi(\vec x)\,d\vec x \le \int_\Omega \phi(\vec x)\,d\vec x$ as a corollary.
Secondly, we have
\begin{equation}\label{ELineq2}
{\cal S}\phi(\vec x) - 2\{{\cal S}\sqrt{\phi} (\vec x)\}^{2} + {\cal N}\phi(\vec x) \ge 0
\end{equation}
for all $\phi\in L_1^+(\Omega)$ and almost all $\vec x\in\Omega$.

\subsection{A subset of functions}

We will consider a subset ${\cal C} \subset L_1^+(\Omega)$ that consists of all $\phi\in L_1(\Omega)$ such that:
\begin{enumerate}
\item $\int_\Omega \phi(\vec x) \, d\vec x = 1$.
\item $\epsilon\le \phi(\vec x)\le M$ for some positive constants $\epsilon$ and $M$.
\end{enumerate}
The rationale for this choice of ${\cal C}$ is that we wish to consider only density functions and we want ${\cal C}$ to be a compact subset of $L_1^+(\Omega)$.

{\bf Remarks:\ }
\begin{enumerate}
\item Although action on ${\cal C}$ by the operator ${\cal S}$ does produce a function whose integral equals 1 (by Fubini's theorem), it is not necessarily the case that ${\cal C}$ is closed under action by ${\cal S}$ because of the condition $\epsilon\le \phi(\vec x)\le M$.
\item The set ${\cal C}$ is not closed under action by ${\cal N}$, since ${\cal N}\phi(\vec x)$ does not necessarily integrate to one.
\item The choice of $L_1(\Omega)$ is motivated by the fact that Cauchy sequences in ${\cal C}$ have limits that are in ${\cal C}$; on the other hand, it is possible to construct a sequence of density functions in, say, $L_2(\Omega)$ whose limit is not a density function even though the sequence is Cauchy (Eggermont \& LaRiccia, 2001, p.~16).
\end{enumerate}

\subsection{Semiparametric mixtures}

In the semiparametric multivariate mixture model setting, the parameters consist of a vector $\vec\lambda\in \bR^m$ of mixture weights, satisfying $\lambda_j\ge0$ and $\sum_j\lambda_j=1$, and corresponding functions $f_1, \ldots, f_m\in{\cal C}$.
We shall write $\vec f=(f_1, \ldots, f_m)$.
For a particular mixture weight vector $\vec\lambda$, we define the mixture operator ${\cal M}_{\vec\lambda}$, applied to a vector ${\cal N}\vec f = ({\cal N} f_1, \ldots, {\cal N} f_m)$ of nonlinearly smoothed functions, as
\[
{\cal M}_{\vec\lambda} {\cal N} \vec f (\vec x) = \sum_{j=1}^m \lambda_j {\cal N} f_j (\vec x).
\]
Let us take $g(\vec x)\in{\cal C}$. %to be some density on $\Omega$.
Given parameters $\vec\lambda$ and $\vec f$, we define
\begin{equation}\label{KL}
L_{\infty}(\vec\lambda, \vec f) \defn -\int_\Omega g(\vec x)\log ([{\cal M}_{\vec\lambda} {\cal N}(\vec f)](\vec x))\,d\vec x,
%+\int_\Omega ([{\cal M}_{\vec\lambda} {\cal N}(\vec f)](\vec x))\,d\vec x ,
\end{equation}
a function that may be viewed as a penalized Kullback--Leibler divergence
%, up to an additive constant depending only on $g$,
between $g$ and the mixed, nonlinearly smoothed $\vec f$.
This is due to the fact that this divergence may be expressed as
\[
\int_\Omega g(\vec x)\log \left( \frac{g(\vec x)} {[{\cal M}_{\vec\lambda} {\cal N}(\vec f)](\vec x)} \right) \,d\vec x +\int_\Omega \left( [{\cal M}_{\vec\lambda} {\cal N}(\vec f)](\vec x) - g(\vec x) \right)\,d\vec x,
\]
where the second integral is nonpositive due to Inequality~\eqref{ELineq1} and may be viewed as the penalization term.
{\color{red}{\em from Dave: I wonder whether this interpretation of the $L_\infty$ function makes any sense. What types of functions $f$ will be more penalized, and what types will be less penalized?}}

{\bf I believe this means that the difference between the target density $g(\vec x)$ and a sum of reweighted smoothed density components is being penalized. Depending on the choice of bandwidth $h,$ the second integral term in the above can be more or less smooth and, therefore, the difference between $g(\vec x)$ and the ``reweighted mixture'' will be smaller or larger, respectively. Since it is impossible to fit ``spikes'' for individual components in the ``continuous'' case, this argument seems to work well here.}

We may also view $-L_\infty$ as a smoothed log-likelihood function corresponding to an infinitely large sample from $g$.
Our general aim will be to minimize $L_\infty$ as a function of $\vec \lambda$ and $\vec f$.
Starting with parameters $\vec\lambda^0$ and $\vec f^0$, Levine et al.\ (2011, Section~3) show that the iterative algorithm
\begin{equation}\label{algorithm}
f^{p+1}_j(\vec u) \equiv \left[\int_\Omega \frac{g(\vec x)}{{\cal M}_{\vec\lambda^p}{\cal N} \vec f^p(\vec x)}\lambda_j^p{\cal N}f^p_{j}(\vec x)K_{h}(\vec x-\vec u)\,d\vec x\right]
\end{equation}
for $j=1, \ldots, m$ and $p=0, 1, \ldots$ possesses the descent property
\begin{equation*}
L_\infty(\vec\lambda^p, \vec f^{p+1}) \le L_\infty(\vec\lambda^p, \vec f^p).
\end{equation*}
{\color{red}{\em from Dave: Equation~\eqref{algorithm} probably needs to be modified because the $f_j^{p+1}$ must integrate to one. Also, even after normalizing, $\vec f^{p+1}$ need not be contained in ${\cal C}$.
I'm not sure how to get around this problem.}}

Furthermore, additional development in Levine et al.\ (2011, Section~3) shows that if we update the $\vec\lambda$ parameter as
\begin{equation}\label{algorithm2}
\lambda_j^{p+1} = \left[\int_\Omega \frac{g(\vec x)}{{\cal M}_{\vec\lambda^p}{\cal N}\vec f^{p+1}(\vec x)}\lambda_j^p{\cal N}f^{p+1}_{j}(\vec x)\,d\vec x\right],
\end{equation}
we obtain
\begin{equation}\label{descent}
L_\infty(\vec\lambda^{p+1}, \vec f^{p+1}) \le L_\infty(\vec\lambda^p, \vec f^{p+1}) \le L_\infty(\vec\lambda^p, \vec f^p).
\end{equation}
The fact that Kullback--Leibler divergence between two distributions is zero if and only if the distributions coincide implies that equality in~\eqref{descent} may only be attained if $(\vec\lambda^p, \vec f^p) =(\vec\lambda^{p+1}, \vec f^{p+1})$.
%{\bf What about $\vec \lambda_{p}=\vec \lambda_{p+1}$}
So if we assume the existence of $(\vec\lambda^S, \vec f^S)$, a global minimizer of $L_\infty$, then
%letting $(\vec\lambda^p, \vec f^p)=(\vec\lambda^S, \vec f^S)$
%implies equality in~(\ref{descent}),
it must be the case that
\begin{equation}\label{fixedpt}
f^S_j(\vec u) \equiv \left[\int_\Omega \frac{g(\vec x)}{{\cal M}_{\vec\lambda^S}{\cal N}\vec f^S(\vec x)}\lambda_j^S{\cal N}f^S_{j}(\vec x)K_{h}(\vec x-\vec u)\,d\vec x\right]\!.
\end{equation}
We shall use this fact in the proof of Theorem~\ref{prop2}.
More specifically, we will define the constant $\alpha_j^S$ so that
\begin{equation}\label{alpha}
\alpha_j^S f^S_j(\vec u) = \left[\int_\Omega \frac{g(\vec x)}{{\cal M}_{\vec\lambda^S}{\cal N}\vec f^S(\vec x)}\lambda_j^S{\cal N}f^S_{j}(\vec x)K_{h}(\vec x-\vec u)\,d\vec x\right]\!.
\end{equation}
Integrating with respect to $\vec u$ in Equation~\eqref{alpha}, then summing over $j$, we find that $\sum_{j=1}^m \alpha_j^S = 1$.
%{\it We still have not proved that $(\vec\lambda^S, \vec f^S)$ exists.
%For Theorem~\ref{prop2}, let us merely assume that it does.}

\section{Analogue of EL95 Result}

%With Theorem~\ref{prop1} proved, we now examine how this theorem, which
%describes the behavior of the function $L_\infty(\vec f)$ in the vicinity of
%its minimizer $\vec f_S$, may be used.
The idea of Eggermont and LaRiccia (1995) is to establish that
\[
\| \vec f^* - \vec f_S \|_{L_1} \to 0 \quad\mbox{as $h\to0$},
\]
where $\vec f^*$ is the true density, and then that $\vec f^n \to \vec f_S$.
This establishes that $\vec f^n \to \vec f^*$ as desired.
Rates of convergence are also possible.
The first step seems to be the following theorem:

%However, in our case, we must treat the whole parameter $(\vec\lambda, \vec f)$
%instead of merely the density $\vec f$. Perhaps it is possible to prove an inequality similar
%to~(\ref{ineq}). Let us first re-define $L_\infty$, as follows:
%$$\label{loglik_infty}
%L_{\infty}(\vec\lambda, \vec f) \defn -\int g(\vec x)\log ([{\cal M}_{\vec\lambda} {\cal N}(\vec f)](\vec x))\,d\vec x.
%$$
%Also, note as shown in Levine et al (2011)
%that $f_{S}$ is the fixed point of the iterative algorithm
%$$\label{EM}
%f^{p+1}_j(\vec u) \propto
%\left[\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda_S}{\cal N}\vec f^p(\vec x)}*{\cal N}f^p_{j}(\vec x)K_{h}(\vec x-\vec u)\,d\vec x\right];
%$$
%let us therefore define $\alpha_j$ to be the normalizing constant that ensures
%$$\label{fixedpt}
%f^S_j(\vec u) = \alpha_j
%\left[\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda_S}{\cal N}\vec f^S(\vec x)}*{\cal N}f^S_{j}(\vec x)K_{h}(\vec x-\vec u)\,d\vec x\right].
%$$
\begin{theorem}\label{prop2}
Assume that $(\vec\lambda_S, \vec f_S)$ can be defined as the unique (up to label-switching) minimizer of $L_\infty(\vec\lambda, \vec f)$.
%For all $(\vec\lambda, \vec f)$, l
Let $r_S(\vec x) = g(\vec x)/{\cal M}_{\vec\lambda_S}{\cal N}\vec f_{S}(\vec x)$.
Then
\begin{eqnarray}\label{newineq}
L_\infty(\vec\lambda, \vec f) - L_\infty(\vec\lambda_S, \vec f_S) &\ge & \frac12 \left( \int_{\Sigma} \sqrt{r_{S}(\vec x)} \left| ({\cal M}_{\vec\lambda}{\cal N}\vec f- {\cal M}_{\vec\lambda_S}{\cal N}\vec f_{S})(\vec x) \right |\,d\vec x\right)^{2} \\
&& +\frac{k^{2}}{8}\sum_{j=1}^{m}\frac{\lambda_{j}\alpha_{j}^{S}}{\lambda_{j}^{S}} \left\|f_{j}-f_{j}^{S} \right\|^{2}_{L^{1}(\Omega)} + \sum_{j=1}^m \alpha_j^S \left( 1 - \frac{\lambda_j }{\lambda_j^S} \right).\nonumber
\end{eqnarray}
\end{theorem}

{\bf Proof}
As a first step,
\begin{eqnarray*}
&&\left( \int_{\Sigma} \sqrt{r_{S}(\vec x)} \left| ({\cal M}_{\vec\lambda}{\cal N}\vec f- {\cal M}_{\vec\lambda_S}{\cal N}\vec f_{S})(\vec x) \right |\,d\vec x\right)^{2} \\
&=&\left( \int_{\Sigma} \sqrt{r_{S}(\vec x)}\sqrt{ \left [ ({\cal M}_{\vec\lambda}{\cal N}\vec f- {\cal M}_{\vec\lambda_S} {\cal N}\vec f_{S})(\vec x) \right ]^{2}}\,d\vec x\right)^{2}\\
&\le & \Bigg[ \int_{\Sigma} \sqrt{r_{S}(\vec x)}\sqrt{\left( \frac{4}{3}[{\cal M}_{\vec\lambda}{\cal N}f](\vec x) +\frac{2}{3}[{\cal M}_{\vec\lambda_S}{\cal N}f_{S}](\vec x)\right)} \\
&&\times \sqrt{\left({\cal M}_{\vec\lambda_S}{\cal N}f_{S}(\vec x) \log \frac{ [{\cal M}_{\vec\lambda_S}{\cal N}f_{S}](\vec x)}{[{\cal M}_{\vec\lambda} {\cal N}f](\vec x)}+{\cal M}_{\vec\lambda}{\cal N}f(\vec x) - {\cal M}_{\vec\lambda_S}{\cal N}f_{S}(\vec x)\right)}\,d\vec x \Bigg ]^{2}
\end{eqnarray*}
using the following inequality of Kemperman (1967), which is also used by Eggermont and LaRiccia (1995), though the latter state it incorrectly:
\[
(u-v)^{2}\le \left(\frac{2}{3}u+\frac{4}{3}v\right)\left(u\log \frac{u}{v}+v-u\right).
\]
Continuing, the Cauchy--Schwarz inequality together with the fact that ${\cal N}\phi(\vec x) \le {\cal S}\phi(\vec x)$ allows us to bound the expression above by
\begin{eqnarray*}
&& \int \left( \frac{4}{3}[{\cal M}_{\vec\lambda}{\cal S}f](\vec x) +\frac{2}{3}[{\cal M}_{\vec\lambda_S}{\cal S}f_{S}](\vec x)\right)\,
d\vec x \\ && \times \int \left[g(\vec x)\log \frac{[{\cal M}_{\vec\lambda_S}{\cal N}f_{S}](\vec x)}{[{\cal M}_{\vec\lambda}{\cal N}f](\vec x)}+r_{S}(\vec x)\left[{\cal M}_{\vec\lambda}{\cal N}f(\vec x) - {\cal M}_{\vec\lambda_S}{\cal N}f_{S}(\vec x)\right]\right]\,d\vec x \\ &= & 2 \int \left[g(\vec x)\log \frac{[{\cal M}_{\vec\lambda_S}{\cal N}f_{S}](\vec x)}{[{\cal M}_{\vec\lambda}{\cal N}f](\vec x)}+ r_{S}(\vec x){\cal M}_{\vec\lambda}{\cal N}f(\vec x) - g(\vec x) \right]\,d\vec x \\ & = &2 \left[ L_{\infty}(\vec\lambda, \vec f)-L_{\infty}(\vec\lambda_S, \vec f_{S}) \right] +2\int r_{S}(\vec x){\cal M}_{\vec\lambda}{\cal N}f(\vec x) \,d\vec x - 2, \end{eqnarray*} where the penultimate equality uses Fubini's theorem and the last uses the definition of $L_{\infty}$. We continue by writing \begin{eqnarray*} \int r_{S}(\vec x){\cal M}_{\vec\lambda}{\cal N}\vec f(\vec x) &=&\sum_{j=1}^{m}\lambda_{j}\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda_S}{\cal N}\vec f^{S}(\vec x)}{\cal N} f_{j}^{S}(\vec x)\left[{\cal N}\left(\frac{f_{j}(\vec x)}{f_{j}^{S}(\vec x)}\right) \right]\,d\vec x \\ &=&\sum_{j=1}^{m}\lambda_{j}\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda_S}{\cal N}\vec f^{S}(\vec x)}{\cal N} f_{j}^{S}(\vec x)\left[{\cal N}\left(\frac{f_{j}(\vec x)}{f_{j}^{S}(\vec x)}\right)-S\left(\frac{f_{j}(\vec x)}{f_{j}^{S}(\vec x)}\right)\right]\,d\vec x\\ && + \sum_{j=1}^{m}\lambda_{j}\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda_S}{\cal N}\vec f^{S}(\vec x)}{\cal N} f_{j}^{S}(\vec x)S\left(\frac{f_{j}(\vec x)}{f_{j}^{S}(\vec x)}\right)\,d\vec x. \end{eqnarray*} The last term above can be rewritten using Fubini's theorem as $$\int\, d\vec u \sum_{j=1}^{m}\lambda_{j}\frac{f_{j}(\vec u)}{f_{j}^{S}(\vec u)}\left[\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda^S}{\cal N}\vec f^{S}(\vec x)}{\cal N}f_{j}^{S}(\vec x)K_{h}(\vec x-\vec u)\,d\vec x\right],$$ which equals $\sum_{j=1}^m \frac{\lambda_j\alpha_j^S}{\lambda_j^S }$ by equation~(\ref{alpha}). 
Finally, we consider the term $$\label{lastterm} \sum_{j=1}^{m}\lambda_{j}\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda_S}{\cal N} \vec f^{S}(\vec x)}{\cal N} f_{j}^{S}(\vec x)\left[{\cal N} \left(\frac{f_{j}(\vec x)}{f_{j}^{S}(\vec x)}\right)-S\left(\frac{f_{j}(\vec x)} {f_{j}^{S}(\vec x)}\right)\right]\,d\vec x.$$ Following Eggermont and LaRiccia (1995), denote $\phi_j=\sqrt{f_j/f_j^S}$. Then, since $N(\psi)\le \{S(\sqrt{\psi})\}^{2}$ for any $\psi$, the term in square brackets in~(\ref{lastterm}) is bounded above by $(S\phi_{j})^{2} - S(\phi_{j}^{2})$. Putting all of this together, we obtain \begin{eqnarray*} &&\frac12 \left( \int_{\Sigma} \sqrt{r_{S}(\vec x)} \left| ({\cal M}_{\vec\lambda}{\cal N}\vec f- {\cal M}_{\vec\lambda_S}{\cal N}\vec f_{S})(\vec x) \right |\,d\vec x\right)^{2} \\ & \le & L_\infty(\vec\lambda, \vec f) - L_\infty(\vec\lambda_S, \vec f_S) - 1 + \sum_{j=1}^m \frac{\lambda_j \alpha_j^S}{\lambda_j^S } \\ &-& \sum_{j=1}^{m}\lambda_{j}\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda_S}{\cal N} \vec f^{S}(\vec x)}{\cal N} f_{j}^{S}(\vec x) \left ( [S(\phi^{2}_{j})](\vec x)-\{[S\phi_{j} ](\vec x)\}^{2} \, \right ) d\vec x. 
\end{eqnarray*}
In order to continue, one needs to use the Lagrange identity:
\[
[S(\phi^{2}_{j})](\vec x)-\{[S\phi_{j}](\vec x)\}^{2}=\int s_{h}(\vec x,\vec y)s_{h}(\vec x,\vec z)(\phi_{j}(\vec y)-\phi_{j}(\vec z))^{2}\,d\vec y\,d\vec z.
\]
Due to the Lagrange identity, we now have
\begin{eqnarray*}
&&\sum_{j=1}^{m}\lambda_{j}\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda^S}{\cal N}\vec f^{S}(\vec x)}{\cal N} f_{j}^{S}(\vec x) \left\{S(\phi_{j}^{2})-(S\phi_{j})^{2}\right\}\,d\vec x\\
&=&\sum_{j=1}^{m}\lambda_{j}\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda^S}{\cal N}\vec f^{S}(\vec x)}{\cal N}f_{j}^{S}(\vec x) \left[\int s_{h}(\vec x,\vec y)s_{h}(\vec x,\vec z)(\phi_{j}(\vec y)-\phi_{j}(\vec z))^{2}\,d\vec y\,d\vec z\right]\,d\vec x\\
&=&\sum_{j=1}^{m}\lambda_{j}\int_{\Omega\times \Omega}(\phi_{j}(\vec y)-\phi_{j}(\vec z))^{2}\left[\int_{\Omega}\frac{g(\vec x)}{{\cal M}_{\vec\lambda^S}{\cal N}\vec f^{S}(\vec x)}{\cal N}f_{j}^{S}(\vec x)s_{h}(\vec x,\vec y)s_{h}(\vec x,\vec z)\,d\vec x\right] d\vec y\, d\vec z\\
&\ge & \sum_{j=1}^{m}\lambda_{j}\int_{\Omega \times \Omega}(\phi_{j}(\vec y)-\phi_{j}(\vec z))^{2}\frac{k^{2}}{\vert \Omega \vert}\left[\int_{\Omega}\frac{g(\vec x)}{{\cal M}_{\vec\lambda^S}{\cal N}\vec f^{S}(\vec x)}{\cal N}f_{j}^{S}(\vec x)s_{h}(\vec x,\vec y)\,d\vec x\right]d\vec y\, d\vec z\\
&=&\sum_{j=1}^{m}\frac{\lambda_{j}\alpha_j^S}{\lambda_j^S}\int_{\Omega\times\Omega}(\phi_{j}(\vec y)-\phi_{j}(\vec z))^{2}\frac{k^{2}}{\vert \Omega\vert}f_{j}^{S}(\vec y)\,d{\vec y}\, d\vec z,
\end{eqnarray*}
where the last step follows from~(\ref{alpha}) as before.
Replacing the outermost integral (in $\vec z$) with a minimum over all possible constants $\mu \in \bR,$ one obtains
\begin{eqnarray*}
&&\int_{\Omega\times\Omega}(\phi_{j}(\vec y)-\phi_{j}(\vec z))^{2}\frac{k^{2}}{\vert \Omega\vert}f_{j}^{S}(\vec y)\,d{\vec y}\, d\vec z \\
%&\ge &\frac{k^{2}}{\vert \Omega\vert} \int_{\Omega}\min_{\vec z} \int_{\Omega} (\phi_{j}(\vec y)-\phi_{j}(\vec z))^{2}f_{j}^{S}(\vec y)\,d{\vec y}\, d\vec z\\
&\ge & k^{2} \min_{\mu} \int_{\Omega} (\phi_{j}(\vec y)-\mu)^{2}f_{j}^{S}(\vec y)\,d{\vec y}\\
&=& k^{2}\int_{\Omega} (\phi_{j}(\vec y)-\nu)^{2}f_{j}^{S}(\vec y)\,d{\vec y},
\end{eqnarray*}
where the minimum is clearly achieved when
\[
\mu\equiv \nu= \int_{\Omega}\phi_{j}(\vec y)f_{j}^{S}(\vec y)\,d\vec y=\int_{\Omega} \sqrt{f_{j}(\vec y)}\sqrt{f_{j}^{S}(\vec y)}\,d\vec y.
\]
Substituting the actual values of $\phi_{j}={\sqrt{f_{j}}}/{\sqrt{f_{j}^{S}}}$ and $\nu$, we find that
\begin{eqnarray*}
&&\int_{\Omega\times\Omega}(\phi_{j}(\vec y)-\phi_{j}(\vec z))^{2}\frac{k^{2}}{\vert \Omega\vert}f_{j}^{S}(\vec y)\,d{\vec y}\, d\vec z \\
&\ge &k^{2}\int_{\Omega}\left(\sqrt{f_{j}(\vec y)}-\nu \sqrt{f_{j}^{S}(\vec y)}\right)^{2}\,d\vec y=k^{2}%\sum_{j=1}^{m}\lambda_{j}
\left\|\sqrt{f_{j}}-\nu\sqrt {f_{j}^{S}}\right\|^{2}_{L^{2}(\Omega)}.
\end{eqnarray*}
Due to the above, $\nu \sqrt{f_{j}^{S}}$ can be viewed as the Euclidean projection (in the space $L_{2}(\Omega)$) of $\sqrt{f_{j}}$ onto the linear space spanned by $\sqrt{f_{j}^{S}};$ such a space is, essentially, a straight line in $L_{2}(\Omega).$
Therefore, on one hand,
\begin{equation*}
\left\| \sqrt{f_{j}}-\nu\sqrt {f_{j}^{S}} \right\| ^{2}_{L^{2}(\Omega)}\le \left\| \sqrt{f_{j}}-\sqrt {f_{j}^{S}}\right \|^{2}_{L^{2}(\Omega)}
\end{equation*}
for any $j=1,\ldots,m$.
On the other hand, the constant $\nu$ satisfies $0< \nu=\int_{\Omega}\sqrt{f_{j}f_{j}^{S}}\le 1$ due to the Cauchy--Schwarz inequality and the fact that both $f_{j}$ and $f_{j}^{S}$ are true densities.
Therefore,
\[
\left \| \sqrt{f_{j}}-\nu \sqrt {f_{j}^{S}} \right\|^{2}_{L^{2}(\Omega)}=1-\nu^{2}\ge 1-\nu=\frac{1}{2} \left\|\sqrt{f_{j}}-\sqrt {f_{j}^{S}} \right \|^{2}_{L^{2}(\Omega)}
\]
and we can now say that
\[
\frac{1}{2} \left\|\sqrt{f_{j}}-\sqrt {f_{j}^{S}} \right\|^{2}_{L^{2}(\Omega)}\le \left\|\sqrt{f_{j}}-\nu\sqrt {f_{j}^{S}} \right\|^{2}_{L^{2}(\Omega)}.
\]
This, in turn, means that
\begin{eqnarray*}
&&\sum_{j=1}^{m}\lambda_{j}\int \frac{g(\vec x)}{{\cal M}_{\vec\lambda^S}{\cal N}\vec f^{S}(\vec x)}{\cal N} f_{j}^{S}(\vec x) \left\{S(\phi_{j}^{2})-(S\phi_{j})^{2}\right\}\,d\vec x\\
&\ge& \frac{k^{2}}{2}\sum_{j=1}^{m}\frac{\lambda_{j}\alpha_j^S}{\lambda_j^S} \left\|\sqrt{f_{j}}-\sqrt{f_{j}^{S}} \right\|^{2}_{L^{2}(\Omega)}\\
&\ge & \frac{k^{2}}{8}\sum_{j=1}^{m} \frac{\lambda_{j}\alpha_j^S}{\lambda_j^S} \left\|f_{j}-f_{j}^{S} \right\|^{2}_{L^{1}(\Omega)};
\end{eqnarray*}
the last inequality follows from the obvious fact that
\[
\left\|f_{j}-f_{j}^{S} \right\|^{2}_{L^{1}(\Omega)}\le 4 \left\|\sqrt {f_{j}}-\sqrt{f_{j}^{S}} \right\|^{2}_{L^{2}(\Omega)}.
\]
We thus obtain
%\begin{eqnarray*}
%&&\frac12 \left( \int_{\Sigma} \sqrt{r_{S}(\vec x)} \left| ({\cal M}_{\vec\lambda}{\cal N}\vec f-
%{\cal M}_{\vec\lambda_S}{\cal N}\vec f_{S})(\vec x) \right |\,d\vec x\right)^{2} \\
%& \le &
%L_\infty(\vec\lambda, \vec f) - L_\infty(\vec\lambda_S, \vec f_S)-1
%+\sum_{j=1}^m \frac{\lambda_j \alpha_j^S}{\lambda_j^S }
%\left\{ 1 - \frac{k^{2}}{8} \left\|f_{j}-f_{j}^{S} \right\|^{2}_{L^{1}(\Omega)} \right \}.
%\end{eqnarray*}
%{\bf It is more instructive to rewrite the above inequality as
\begin{eqnarray*}
L_\infty(\vec\lambda, \vec f) - L_\infty(\vec\lambda_S, \vec f_S) &\ge & \frac12 \left( \int_{\Sigma} \sqrt{r_{S}(\vec x)} \left| ({\cal M}_{\vec\lambda}{\cal N}\vec f- {\cal M}_{\vec\lambda_S}{\cal N}\vec f_{S})(\vec x) \right |\,d\vec x\right)^{2} \\
&& +\frac{k^{2}}{8}\sum_{j=1}^{m}\frac{\lambda_{j}\alpha_{j}^{S}}{\lambda_{j}^{S}} \left\|f_{j}-f_{j}^{S} \right\|^{2}_{L^{1}(\Omega)} + \sum_{j=1}^m \alpha_j^S \left( 1 - \frac{\lambda_j }{\lambda_j^S} \right).
\end{eqnarray*}
%{\it
%I believe I have followed all of the algebra correctly. At this point, note that
%if $\vec\lambda=\vec\lambda^s$, things simplify considerably:
%\begin{eqnarray*}
%&&\frac12 \left( \int_{\Sigma} \sqrt{r_{S}(\vec x)} \left| ({\cal M}_{\vec\lambda}{\cal N}\vec f-
%{\cal M}_{\vec\lambda_S}{\cal N}\vec f_{S})(\vec x) \right |\,d\vec x\right)^{2} \\
%& \le &
%L_\infty(\vec\lambda, \vec f) - L_\infty(\vec\lambda_S, \vec f_S)
%-\frac{k^2}{8}\sum_{j=1}^m
%\alpha_j^S \left\|f_{j}-f_{j}^{S} \right\|^{2}_{L^{1}(\Omega)},
%\end{eqnarray*}
%where once again we must remember that $\sum_j\alpha_j^S=1$.
%Given this fact, it should be possible to express the final inequality using
%some sort of bound on the distance between $\vec\lambda$ and $\vec\lambda^S$.
%}

\section{Convergence of $\vec f_{n}$ to $\vec f_{S}$}

Consider two different sets of parameters $\theta_{1}=(\vec \lambda^{1},\vec f^{1})$ and $\theta_{2}=(\vec \lambda^{2},\vec f^{2}).$
Recall that
\[
L_{n}(\vec \lambda, \vec f)=-\sum_{i=1}^{n}\log\{({\cal M}_{\vec \lambda}{\cal N}\vec f )(\vec x_{i})\}.
\]
Let $(\vec \lambda_{n},\vec f_{n})$ be a minimizer of $L_{n}(\vec \lambda,\vec f).$
Define $L_{n}(\vec \lambda^{1},\vec f^{1},\vec \lambda^{2},\vec f^{2})=L_{n}(\vec \lambda^{1}, \vec f^{1})-L_{n}(\vec \lambda^{2},\vec f^{2})$ and likewise for $L_{\infty}(\vec \lambda^{1},\vec f^{1},\vec \lambda^{2},\vec f^{2}).$
More explicitly,
\begin{equation}\label{4argfunc1}
L_{n}(\vec \lambda^{1},\vec f^{1},\vec \lambda^{2},\vec f^{2})=\sum_{k=1}^{n}\log{ \frac{\sum_{j=1}^{m}\lambda^{1}_{j}{\cal N}\vec f^{1}_{j}(\vec x_{k})}{\sum_{j=1}^{m}\lambda^{2}_{j}{\cal N}\vec f^{2}_{j}(\vec x_{k})}}
\end{equation}
and
\begin{equation}\label{4argfunc2}
L_{\infty}(\vec \lambda^{1},\vec f^{1},\vec \lambda^{2},\vec f^{2})= \int g(\vec x)\log{ \frac{\sum_{j=1}^{m}\lambda^{1}_{j}{\cal N}\vec f^{1}_{j}(\vec x)}{\sum_{j=1}^{m}\lambda^{2}_{j}{\cal N}\vec f^{2}_{j}(\vec x)}}\,d\vec x.
\end{equation}
Therefore, the difference is
\[
L_{n}(\vec \lambda^{1},\vec f^{1},\vec \lambda^{2},\vec f^{2})-L_{\infty}(\vec \lambda^{1},\vec f^{1},\vec \lambda^{2},\vec f^{2})= \int_{\Sigma} \Phi(\vec x)[dG_{n}(\vec x)-dG(\vec x)],
\]
where $\Phi(\vec x)=\log{ \frac{\sum_{j=1}^{m}\lambda^{1}_{j}{\cal N}\vec f^{1}_{j}(\vec x)}{\sum_{j=1}^{m}\lambda^{2}_{j}{\cal N}\vec f^{2}_{j}(\vec x)}}$, $G_{n}(\vec x)$ is an empirical sampling distribution function, and $G(\vec x)$ is the distribution function corresponding to the target density $g(\vec x).$
We are concerned with this difference because the behavior of the functional $L_{\infty}$ in the vicinity of its minimizer can be described as
\begin{align}\label{diff}
&L_{\infty}(\vec \lambda_{n},\vec f_{n})-L_{\infty}(\vec \lambda_{S},\vec f_{S})=-L_{\infty}(\vec \lambda_{S},\vec f_{S},\vec \lambda_{n},\vec f_{n})\\\nonumber
&\le L_{n}(\vec \lambda_{S},\vec f_{S},\vec \lambda_{n},\vec f_{n})-L_{\infty}(\vec \lambda_{S},\vec f_{S},\vec \lambda_{n},\vec f_{n})\\\nonumber &=\int_{\Sigma} \Phi(\vec x)[dG_{n}(\vec x)-dG(\vec x)] \end{align} using definitions \eqref{4argfunc1} and \eqref{4argfunc2}. In the above, $\Phi(\vec x)=\log{ \frac{\sum_{j=1}^{m}\lambda^{S}_{j}{\cal N}\vec f^{S}_{j}(\vec x)}{\sum_{j=1}^{m}\lambda^{n}_{j}{\cal N}\vec f_{j}^{n}(\vec x)}}.$ \hrule {\em New (May 11, 2012):} Combining Theorem~1 with inequality~(\ref{diff}) gives \begin{eqnarray*} && K\sum_{j=1}^m \frac{\alpha^S_j\lambda^n_j}{\lambda^S_j} \| f_j^n - f_j^S \|_1 + \frac12 \left( \int \sqrt{r_S(\vec x)} \left| {\cal M}_{\lambda^n} {\cal N} \vec f^n (\vec x) - {\cal M}_{\lambda^S} {\cal N} \vec f^S (\vec x) \right| \, d\vec x \right)^2 \\ &\le & L_\infty(\vec\lambda^n, \vec f^n) - L_\infty(\vec\lambda^S, \vec f^S) + \frac{\max_j \alpha_j^S}{\min_j \lambda_j^S} \| \vec\lambda^n - \vec\lambda^S \|_1 \\ &\le & \int \log \frac{ {\cal M}_{\lambda^S} {\cal N} \vec f^S (\vec x) } {{\cal M}_{\lambda^n} {\cal N} \vec f^n (\vec x)} \left[ dG_n (\vec x) - dG (\vec x) \right] + \frac{\max_j \alpha_j^S}{\min_j \lambda_j^S} \| \vec\lambda^n - \vec\lambda^S \|_1 \\ &\le& \left( 2+ \frac{\max_j \alpha_j^S}{\min_j \lambda_j^S} \right) \| \vec\lambda^n - \vec\lambda^S \|_1 + \int | {\cal N}\vec f^n(\vec x) - {\cal N} \vec f^S(\vec x) | \left[ dG_n(\vec x) - dG(\vec x) \right] \end{eqnarray*} as long as it is possible to show that $| \Phi(\vec x) | \le \| \vec\lambda^n - \vec\lambda^S \|_1 + | {\cal N} \vec f^n (\vec x) - {\cal N} \vec f^S (\vec x) |.$ {\em Michael, I believe this latter fact is what you have shown, correct?} \hrule Our next step will be to analyze the function $\Phi(\vec x)$ in detail; note that the $\sum_{j=1}^{m}\lambda^{S}_{j}{\cal N}\vec f^{S}_{j}(\vec x)$ can be thought of as a trace of the positive definite diagonal matrix with a typical element $\lambda^{S}_{j}{\cal N}\vec f^{S}_{j}(\vec x)$ and the same 
is true of the denominator in the definition of $\Phi(\vec x).$ Since for any two positive definite square matrices $\frac{ tr(B)}{tr (A)} \le tr (A^{-1}B)$ (see, e.g. DasGupta (2008)) we have \begin{align} &\Phi(\vec x) \le \log \left( \sum_{j=1}^{m}\frac{\lambda^{S}_{j}}{\lambda^{n}_{j}}\exp\left(\int_{\Omega}K_{h}(\vec x -\vec u)[\log f^{S}_{j}(\vec x)-\log f^{n}_{j}(\vec x)]\right)\right)\\ &= \log \left( \sum_{j=1}^{m}\frac{\lambda^{S}_{j}}{\lambda^{n}_{j}}{\cal N}\left\{\frac{f_{j}^{S}(\vec x)}{f_{j}^{n}(\vec x)}\right\}\right)\equiv \Psi(\vec x) \end{align} %First, using Jensen's inequality one can show that $Nf_{S}^{j}(\vec x)=\exp(\int K_{h}(\vec x-\vec u)\log f_{S}^{j}(\vec u)\,d\vec u\le \int %K_{h}(\vec x-\vec u)f_{S}^{j}(\vec u)\,d\vec u\le \max_{\vec x\in \Omega} K_{h}(\vec x)=K_{1}.$ This allows us to conclude that %$\sum_{j=1}^{m}\lambda^{j}_{S}{\cal N}f^{j}_{S}\le K_{1}.$ %Note, first, that it can be bounded from the above as %\begin{align}\label{phi} %\Phi(\vec x)&\le \log\left\{K*\left(\sum_{j=1}^{m}[\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec %x)]\right)+1\right\}\\ %&=K*\left(\sum_{j=1}^{m}[\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x)]\right)+o()\equiv \Psi(\vec x) %\end{align} %for some positive constant $K.$ %On the other hand, for any density function $f_{j}\in B$ (see Levine, Hunter and Chauveau (2011)), $f_{j}\ge \inf K_{h}(\vec x)\ge K_{2};$ therefore, %${\cal N}f_{j}^{S}(\vec x)\ge \exp[\int K_{h}(\vec x-\vec u)\log K_{2}\,d\vec u]\ge K_{2}>0.$ In the limit, we can also say that ${\cal N}f_{j}^{S}$ %is bounded away from zero as well. Since we assume that the weights $\lambda_{j}^{S}$ are bounded away from zero, we have that %$\sum_{j=1}^{m}\lambda_{j}^{S}{\cal N}f_{j}^{S}$ is bounded away from zero; thus, we can bound $\Phi(\vec x)$ from above regardless of the sign of %the denominator. 
Thus, \begin{align}\label{int_by_parts} &L_{n}(\vec \lambda_{1},\vec f_{1},\vec \lambda_{2},\vec f_{2})-L_{\infty}(\vec \lambda_{1},\vec f_{1},\vec \lambda_{2},\vec f_{2})\le \int_{\Sigma}\Psi(\vec x)[dG_{n}(\vec x)-dG(\vec x)]\nonumber\\ &=(-1)^{r}\int_{\Sigma}D\Psi(\vec x)[G_{n}(\vec x)-G(\vec x)]\,d\vec x \end{align} where $D\Psi(\vec x)=\frac{{\partial }^{r}}{{\partial }x_{1}{\partial }x_{2}\ldots {\partial }x^{r}}\Psi(\vec x)$ with $r$ being the dimensionality of the space $\Sigma.$ The reason for using integration by parts in \eqref{int_by_parts} is that it is easy to characterize the rate of convergence of $G_{n}(\vec x)$ to $G(\vec x)$ using the empirical process theory. Indeed, due to a well known estimate of Shorack and Wellner, $||G_{n}(\vec x)-G(\vec x)||_{L_{1}(\Sigma)} =O\left( \sqrt{\frac{\log\log n}{n}}\right)\equiv LL(n).$ Therefore, it seems advantageous to present the right-hand side of \eqref{diff} in the form \eqref{int_by_parts}. Now, note that differentiation of ${\cal N}\left\{\frac{f_{j}^{S}(\vec x)}{f_{j}^{n}(\vec x)}\right\}$ w.r.t. any coordinate $x_{k},$ $k=1,\ldots,r$ produces $\frac{\partial }{\partial x^{k}}{\cal N}\left\{\frac{f_{j}^{S}(\vec x)}{f_{j}^{n}(\vec x)}\right\}= {\cal N}\left\{\frac{f_{j}^{S}(\vec x)}{f_{j}^{n}(\vec x)}\right\}\int \frac{\partial }{\partial x^{k}}K_{h}(\vec x-\vec u)[\log f_{j}^{S}(\vec u)-\log f_{j}^{n}(\vec u)]\, d\vec u.$ If we assume that the $k$th partial derivative of the kernel $K(\cdot)$ is continuous on the compact set $\Omega$, it is easy to see that the above does not exceed $C_{1}{\cal N}\left\{\frac{f_{j}^{S}(\vec x)}{f_{j}^{n}(\vec x)}\right\}$ for some positive constant $C_{1}.$ ********************To be continued*********** %One can easily show that the same is true for the $r$ th order derivative of ${\cal N}f_{j}^{n}(\vec x)$ as well. 
On the other hand, given that $\log f_{S}$ is integrable on the compact set $\Omega,$ one can also show that the $r$th order partial derivative of ${\cal N}f_{j}^{S}(\vec x)$ is bounded from below.
Therefore, there exists a constant $C$ that does not depend on $n$ and $h$ such that $D\Psi(\vec x) \le C[\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x)];$ thus,
\[
L_{\infty}(\vec \lambda_{n},\vec f_{n})-L_{\infty}(\vec \lambda_{S},\vec f_{S})\le C\int \Bigl[\sum_{j=1}^{m}(\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x))\Bigr]\,d\vec x\cdot LL(n).
\]
Now, define a set $\Omega_{j}^{+}=\{\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)\ge \lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x)\}$ and $\Omega^{-}_{j}=\Omega\setminus\Omega_{j}^{+}.$
Then,
\begin{align*}
&\int_{\Omega_{j}^{+}}[\sum_{j=1}^{m}(\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x))]\,d\vec x\\
&=\int_{\Omega_{j}^{+}}[\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{n}Sf_{j}^{n}(\vec x)]\,d\vec x+\int_{\Omega_{j}^{+}}\lambda_{j}^{n}S[f_{j}^{n}(\vec x)-f_{j}^{S}(\vec x)]\,d\vec x+\int_{\Omega_{j}^{+}} [\lambda_{j}^{n}Sf_{j}^{S}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x)]\,d\vec x.
\end{align*}
The first integral out of three above is negative since ${\cal N}f_{j}^{n}(\vec x)\le Sf_{j}^{n}(\vec x);$ for the same reason the last integral is positive.
If we define $\Lambda f_{j}^{S}=h^{-2}\int_{\Omega}[\lambda_{j}^{n}Sf_{j}^{S}-\lambda_{j}^{S}{\cal N}f_{j}^{S}](\vec x)\,d\vec x$ as a linear operator with $\sup \Lambda f_{j}^{S}\le M$ for some nonnegative constant $M,$ we have
\[
\int_{\Omega_{j}^{+}}[\sum_{j=1}^{m}(\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x))]\,d\vec x\le \|S( \vec f^{n}-\vec f_{S})\|_{L_{1}(\Omega)}+h^{2}M.
\]
In exactly the same way, one can show that
\begin{align*}
&\int_{\Omega_{j}^{+}}[\sum_{j=1}^{m}(\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x))]\,d\vec x\\
&=\|S(\vec f^{n}-\vec f_{S})\|_{L_{1}(\Omega)}+h^{2}M+\int \sqrt{r_{S}(\vec x)}[\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x)]\,d\vec x.
\end{align*}
Thus, we can conclude that
\begin{align*}
&L_{\infty}(\vec \lambda_{n},\vec f_{n})-L_{\infty}(\vec \lambda_{S},\vec f_{S})\le C\cdot LL(n)\sum_{j=1}^{m}\|f_{j}^{n}-f_{j}^{S}\|_{L^{1}(\Omega)}+Mh^{2}\\
&+\int \sqrt{r_{S}(\vec x)}\{\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x)\}\,d\vec x.
\end{align*}
Let $E\equiv \int \sqrt{r_{S}(\vec x)}\sum_{j=1}^{m}[\lambda_{j}^{n}{\cal N}f_{j}^{n}(\vec x)-\lambda_{j}^{S}{\cal N}f_{j}^{S}(\vec x)]\,d\vec x$ and $e_{j}=\|f_{j}^{n}-f_{j}^{S}\|_{L_{1}(\Omega)}$ for brevity.
Finally, using inequality \eqref{newineq}, one obtains that
\begin{align*}
&\frac{1}{2}E^{2}+1-\sum_{j=1}^{m}\frac{\lambda_{j}\alpha_{j}^{S}}{\lambda_{j}^{S}}+\frac{k^{2}}{8}\sum_{j=1}^{m}\frac{\lambda_{j}\alpha_{j}^{S}}{\lambda_{j}^{S}}e_{j}^{2}\\
&\le C\cdot LL(n)\sum_{j=1}^{m}[e_{j}+Mh^{2}+E].
\end{align*}
The terms with $E$ can be completed to a full square on the left hand side and the resulting full square dropped due to its positivity, thus obtaining an expression for the rate of convergence for $f_{j}^{n}$ to $f_{j}^{S}$ if something preliminary is known about convergence of Euclidean parameters.

\end{document}